Creating a Master Data Set

Start with Bitcoin Data

Data File Description: * Sourced: Coinbase * Consists of Bitcoin prices from December 2014 - November 2020 * 2,182 Data Points: this is a large enough data set to provide analysis. Import Dataset

# Load the BTC/USD price history from CSV; the first row of the file holds
# the column names.
btc_data <- read.csv(file = "Coinbase_BTCUSD_d.csv", header = TRUE)

# Report the table's dimensions (rows, then columns).
dim(btc_data)
## [1] 2182    9
# Preview the first six rows as a sanity check on the import.
head(btc_data)
##    Timestamp       Date Symbol     Open     High      Low    Close Volume.BTC
## 1 1605830400 2020-11-20 BTCUSD 17821.58 18239.00 17764.76 18142.52    3909.44
## 2 1605744000 2020-11-19 BTCUSD 17782.91 18193.29 17356.00 17821.58   17141.49
## 3 1605657600 2020-11-18 BTCUSD 17679.36 18488.00 17205.02 17782.91   32425.64
## 4 1605571200 2020-11-17 BTCUSD 16726.64 17880.00 16575.42 17679.36   25230.04
## 5 1605484800 2020-11-16 BTCUSD 15966.89 16892.00 15879.00 16726.64   13948.06
## 6 1605398400 2020-11-15 BTCUSD 16082.01 16175.60 15796.09 15966.89    6250.08
##   Volume.USD
## 1   70437003
## 2  306201498
## 3  579119955
## 4  436549314
## 5  230076772
## 6   99871183
# Preview the last six rows; the file is stored newest-first, so these are
# the oldest dates.
tail(btc_data)
##       Timestamp       Date Symbol  Open  High    Low Close Volume.BTC
## 2177 1417824000 2014-12-06 BTCUSD 377.1 378.0 377.10 378.0    0.01500
## 2178 1417737600 2014-12-05 BTCUSD 377.1 377.1 377.10 377.1    0.00000
## 2179 1417651200 2014-12-04 BTCUSD 378.0 378.0 377.10 377.1    0.01000
## 2180 1417564800 2014-12-03 BTCUSD 378.0 378.0 377.01 378.0    0.54660
## 2181 1417478400 2014-12-02 BTCUSD 370.0 378.0 370.00 378.0   15.01000
## 2182 1417392000 2014-12-01 BTCUSD 300.0 370.0 300.00 370.0    0.05656
##      Volume.USD
## 2177       5.67
## 2178       0.00
## 2179       3.77
## 2180     206.52
## 2181    5675.07
## 2182      19.53

Order the rows by date, oldest first

# Sort chronologically, oldest first. Date is still text at this point; its
# YYYY-MM-DD layout sorts correctly as plain strings.
btc <- btc_data[order(btc_data$Date), ]

# Confirm the new ordering.
head(btc)
##       Timestamp       Date Symbol  Open  High    Low Close Volume.BTC
## 2182 1417392000 2014-12-01 BTCUSD 300.0 370.0 300.00 370.0    0.05656
## 2181 1417478400 2014-12-02 BTCUSD 370.0 378.0 370.00 378.0   15.01000
## 2180 1417564800 2014-12-03 BTCUSD 378.0 378.0 377.01 378.0    0.54660
## 2179 1417651200 2014-12-04 BTCUSD 378.0 378.0 377.10 377.1    0.01000
## 2178 1417737600 2014-12-05 BTCUSD 377.1 377.1 377.10 377.1    0.00000
## 2177 1417824000 2014-12-06 BTCUSD 377.1 378.0 377.10 378.0    0.01500
##      Volume.USD
## 2182      19.53
## 2181    5675.07
## 2180     206.52
## 2179       3.77
## 2178       0.00
## 2177       5.67

Clean the data further * Date was converted from Chr format to Date format * Symbol Column was removed

# Parse the YYYY-MM-DD text dates into Date objects.
btc[["Date"]] <- as.Date(btc[["Date"]], format = "%Y-%m-%d")
head(btc)
##       Timestamp       Date Symbol  Open  High    Low Close Volume.BTC
## 2182 1417392000 2014-12-01 BTCUSD 300.0 370.0 300.00 370.0    0.05656
## 2181 1417478400 2014-12-02 BTCUSD 370.0 378.0 370.00 378.0   15.01000
## 2180 1417564800 2014-12-03 BTCUSD 378.0 378.0 377.01 378.0    0.54660
## 2179 1417651200 2014-12-04 BTCUSD 378.0 378.0 377.10 377.1    0.01000
## 2178 1417737600 2014-12-05 BTCUSD 377.1 377.1 377.10 377.1    0.00000
## 2177 1417824000 2014-12-06 BTCUSD 377.1 378.0 377.10 378.0    0.01500
##      Volume.USD
## 2182      19.53
## 2181    5675.07
## 2180     206.52
## 2179       3.77
## 2178       0.00
## 2177       5.67
# Drop the Symbol column.
btc <- subset(btc, select = -Symbol)
head(btc)
##       Timestamp       Date  Open  High    Low Close Volume.BTC Volume.USD
## 2182 1417392000 2014-12-01 300.0 370.0 300.00 370.0    0.05656      19.53
## 2181 1417478400 2014-12-02 370.0 378.0 370.00 378.0   15.01000    5675.07
## 2180 1417564800 2014-12-03 378.0 378.0 377.01 378.0    0.54660     206.52
## 2179 1417651200 2014-12-04 378.0 378.0 377.10 377.1    0.01000       3.77
## 2178 1417737600 2014-12-05 377.1 377.1 377.10 377.1    0.00000       0.00
## 2177 1417824000 2014-12-06 377.1 378.0 377.10 378.0    0.01500       5.67

Close price of the next day * Closing price is an accurate representation of overall price and is slightly less volatile than daily high or low prices * We decided not to use this variable; however, to improve our model in the future, it could be used to predict the future price of bitcoin rather than merely whether the price will increase or decrease

#btc$Close.nextday = 0
#test_var <- btc$Close
#column_data_close_price <- 0

#for(i in 1:length(test_var)) {
  #column_data_close_price[i] <- test_var[i+1]
#}
#btc$Close.nextday = column_data_close_price

#head(btc)

Create a binary close variable (H/L) * This binary variable indicates whether tomorrow's closing price increases (H) or decreases/stays the same (L)

# Binary target answering "does tomorrow close higher than today?":
#   1 (H) - the next day's close is strictly greater than this day's close
#   0 (L) - the next day's close is lower or unchanged
#   NA    - no next-day close exists (the final row) or a close is missing
# The earlier element-by-element loop fell through to its `else` branch on the
# final row, because the comparison with the non-existent next close is NA,
# and so labelled that row 1 (H) without any data to support it.
test_var <- btc$Close

# Shift the closes up by one position so element i holds day i + 1's close;
# the final day has no successor, hence the trailing NA.
next_day_close <- c(test_var[-1], NA)

# One vectorised comparison replaces the loop. as.numeric() maps
# TRUE / FALSE / NA to 1 / 0 / NA, so the column stays numeric as before.
column_data_close_HL <- as.numeric(next_day_close > test_var)

btc$HL.Close <- column_data_close_HL
p <- btc$HL.Close

head(btc)
##       Timestamp       Date  Open  High    Low Close Volume.BTC Volume.USD
## 2182 1417392000 2014-12-01 300.0 370.0 300.00 370.0    0.05656      19.53
## 2181 1417478400 2014-12-02 370.0 378.0 370.00 378.0   15.01000    5675.07
## 2180 1417564800 2014-12-03 378.0 378.0 377.01 378.0    0.54660     206.52
## 2179 1417651200 2014-12-04 378.0 378.0 377.10 377.1    0.01000       3.77
## 2178 1417737600 2014-12-05 377.1 377.1 377.10 377.1    0.00000       0.00
## 2177 1417824000 2014-12-06 377.1 378.0 377.10 378.0    0.01500       5.67
##      HL.Close
## 2182        1
## 2181        0
## 2180        0
## 2179        0
## 2178        1
## 2177        0

## Cryptocurrencies Section

Ethereum Data

Data Description * Ethereum price data from May 27 2016 - November 20 2020 * Source: Coinbase

# Load the ETH/USD price history; the first row holds the column names.
eth_data <- read.csv(file = "Coinbase_ETHUSD_d.csv", header = TRUE)

# Sort chronologically, oldest first (YYYY-MM-DD text sorts correctly).
eth <- eth_data[order(eth_data$Date), ]

# Drop every column that is not needed downstream in a single step: the
# timestamp, the symbol, the intraday prices and the coin-denominated volume.
eth <- subset(
  eth,
  select = -c(Unix.Timestamp, Symbol, Open, High, Low, Volume.ETH)
)

# Parse the text dates into Date objects.
eth[["Date"]] <- as.Date(eth[["Date"]], format = "%Y-%m-%d")

# Prefix the remaining columns with the coin so they stay unique after the
# later merge: Close becomes ETH.Price, Volume.USD becomes ETH.Volume.
names(eth) <- replace(names(eth), names(eth) == "Close", "ETH.Price")
names(eth) <- replace(names(eth), names(eth) == "Volume.USD", "ETH.Volume")

# Preview the first six rows to confirm the clean-up.
head(eth)
##            Date ETH.Price ETH.Volume
## 1639 2016-05-27     11.25  151147.98
## 1638 2016-05-28     11.93  180822.02
## 1637 2016-05-29     12.34   42228.37
## 1636 2016-05-30     12.41   51655.95
## 1635 2016-05-31     14.00   76994.75
## 1634 2016-06-01     13.93  145746.12
# Preview the last six rows (the most recent dates after sorting).
tail(eth)
##         Date ETH.Price ETH.Volume
## 6 2020-11-15    448.58   40271351
## 5 2020-11-16    460.85   51758620
## 4 2020-11-17    482.68   93082972
## 3 2020-11-18    478.96  141725015
## 2 2020-11-19    471.92   62514644
## 1 2020-11-20    484.88   29558467

Litecoin Data

Data Description * Litecoin price data from August 23 2016 - November 20 2020 * Source: Coinbase

# Load the LTC/USD price history; the first row holds the column names.
ltc_data <- read.csv(file = "Coinbase_LTCUSD_d.csv", header = TRUE)

# Sort chronologically, oldest first (YYYY-MM-DD text sorts correctly).
ltc <- ltc_data[order(ltc_data$Date), ]

# Drop every column that is not needed downstream in a single step: the
# timestamp, the symbol, the intraday prices and the coin-denominated volume.
ltc <- subset(
  ltc,
  select = -c(Unix.Timestamp, Symbol, Open, High, Low, Volume.LTC)
)

# Parse the text dates into Date objects.
ltc[["Date"]] <- as.Date(ltc[["Date"]], format = "%Y-%m-%d")

# Prefix the remaining columns with the coin so they stay unique after the
# later merge: Close becomes LTC.Price, Volume.USD becomes LTC.Volume.
names(ltc) <- replace(names(ltc), names(ltc) == "Close", "LTC.Price")
names(ltc) <- replace(names(ltc), names(ltc) == "Volume.USD", "LTC.Volume")

# Preview the first six rows to confirm the clean-up.
head(ltc)
##            Date LTC.Price LTC.Volume
## 1551 2016-08-23      3.95    1737.31
## 1550 2016-08-24      3.84   19247.53
## 1549 2016-08-25      3.81   19276.60
## 1548 2016-08-26      3.81   12746.27
## 1547 2016-08-27      3.78    4295.72
## 1546 2016-08-28      3.72    7111.87
# Preview the last six rows (the most recent dates after sorting).
tail(ltc)
##         Date LTC.Price LTC.Volume
## 6 2020-11-15     62.37    8286862
## 5 2020-11-16     73.83   43992549
## 4 2020-11-17     76.41   45138596
## 3 2020-11-18     73.48   48863408
## 2 2020-11-19     81.64   63647825
## 1 2020-11-20     81.22   13261137

Ripple Data

Data Description * Ripple price data from January 17 2017 - November 20 2020 * Source: Bitstamp

# Load the XRP/USD price history; the first row holds the column names.
xrp_data <- read.csv(file = "Bitstamp_XRPUSD_d.csv", header = TRUE)

# Sort chronologically, oldest first (YYYY-MM-DD text sorts correctly).
xrp <- xrp_data[order(xrp_data$Date), ]

# Drop every column that is not needed downstream in a single step: the
# timestamp, the symbol, the intraday prices and the coin-denominated volume.
xrp <- subset(
  xrp,
  select = -c(Unix.Timestamp, Symbol, Open, High, Low, Volume.XRP)
)

# Parse the text dates into Date objects.
xrp[["Date"]] <- as.Date(xrp[["Date"]], format = "%Y-%m-%d")

# Prefix the remaining columns with the coin so they stay unique after the
# later merge: Close becomes XRP.Price, Volume.USD becomes XRP.Volume.
names(xrp) <- replace(names(xrp), names(xrp) == "Close", "XRP.Price")
names(xrp) <- replace(names(xrp), names(xrp) == "Volume.USD", "XRP.Volume")

# Preview the first six rows to confirm the clean-up.
head(xrp)
##            Date XRP.Price XRP.Volume
## 1404 2017-01-17   0.00683   30673.69
## 1403 2017-01-18   0.00680   38018.93
## 1402 2017-01-19   0.00684   19882.33
## 1401 2017-01-20   0.00660   11374.15
## 1400 2017-01-21   0.00684   13955.92
## 1399 2017-01-22   0.00678    3878.76
# Preview the last six rows (the most recent dates after sorting).
tail(xrp)
##         Date XRP.Price XRP.Volume
## 6 2020-11-15    0.2697   14292147
## 5 2020-11-16    0.2880   23107272
## 4 2020-11-17    0.3026   35535529
## 3 2020-11-18    0.2937   40662719
## 2 2020-11-19    0.3044   34206680
## 1 2020-11-20    0.3005    8675902
# Row count of the XRP table, which covers the shortest time span.
nrow(xrp)
## [1] 1404

Data Cleaning

Filter Rows for Consistency * Because the crypto data sets start on different dates than the BTC data set, rows need to be removed so that all series are aligned + This will weaken the model, as it removes 790 Bitcoin data points (2,182 rows reduced to 1,392)

# Number of most-recent rows to keep from every coin table. It is derived from
# XRP because that table has the fewest historical rows.
# TBD: the reason for the extra 12 is undocumented. Its visible effect is to
# move the common start from XRP's first date (2017-01-17) to 2017-01-29.
c_rows <- nrow(xrp) - 12

# Bitcoin: keep only the most recent c_rows rows.
btc_c <- tail(btc, n = c_rows)
head(btc_c)
##       Timestamp       Date    Open    High    Low   Close Volume.BTC Volume.USD
## 1392 1485648000 2017-01-29  924.70  927.47 915.00  917.31    2498.61    2303176
## 1391 1485734400 2017-01-30  917.31  923.95 914.69  923.45    3678.36    3385239
## 1390 1485820800 2017-01-31  923.45  971.24 922.83  970.92    6624.94    6298154
## 1389 1485907200 2017-02-01  970.92  991.38 963.84  989.71    5983.96    5835317
## 1388 1485993600 2017-02-02  989.71 1010.00 978.74 1007.66    5623.69    5602317
## 1387 1486080000 2017-02-03 1007.66 1024.50 994.34 1016.77    6731.61    6815466
##      HL.Close
## 1392        1
## 1391        1
## 1390        1
## 1389        1
## 1388        1
## 1387        1
# Preview the last six rows of the trimmed Bitcoin table.
tail(btc_c)
##    Timestamp       Date     Open     High      Low    Close Volume.BTC
## 6 1605398400 2020-11-15 16082.01 16175.60 15796.09 15966.89    6250.08
## 5 1605484800 2020-11-16 15966.89 16892.00 15879.00 16726.64   13948.06
## 4 1605571200 2020-11-17 16726.64 17880.00 16575.42 17679.36   25230.04
## 3 1605657600 2020-11-18 17679.36 18488.00 17205.02 17782.91   32425.64
## 2 1605744000 2020-11-19 17782.91 18193.29 17356.00 17821.58   17141.49
## 1 1605830400 2020-11-20 17821.58 18239.00 17764.76 18142.52    3909.44
##   Volume.USD HL.Close
## 6   99871183        1
## 5  230076772        1
## 4  436549314        1
## 3  579119955        1
## 2  306201498        1
## 1   70437003        1
# Ethereum: keep only the most recent c_rows rows.
eth <- tail(x = eth, n = c_rows)
head(eth)
##            Date ETH.Price ETH.Volume
## 1392 2017-01-29     10.50   189086.3
## 1391 2017-01-30     10.59   437746.5
## 1390 2017-01-31     10.74   413350.2
## 1389 2017-02-01     10.73   630953.7
## 1388 2017-02-02     10.82   513774.8
## 1387 2017-02-03     10.95   531755.4
# Preview the last six rows of the trimmed Ethereum table.
tail(eth)
##         Date ETH.Price ETH.Volume
## 6 2020-11-15    448.58   40271351
## 5 2020-11-16    460.85   51758620
## 4 2020-11-17    482.68   93082972
## 3 2020-11-18    478.96  141725015
## 2 2020-11-19    471.92   62514644
## 1 2020-11-20    484.88   29558467
# Litecoin: keep only the most recent c_rows rows.
ltc <- tail(x = ltc, n = c_rows)
head(ltc)
##            Date LTC.Price LTC.Volume
## 1392 2017-01-29      3.88    2151.59
## 1391 2017-01-30      4.03   23569.63
## 1390 2017-01-31      4.07   35332.93
## 1389 2017-02-01      4.08   17621.75
## 1388 2017-02-02      4.09   17753.63
## 1387 2017-02-03      4.06   15202.71
# Preview the last six rows of the trimmed Litecoin table.
tail(ltc)
##         Date LTC.Price LTC.Volume
## 6 2020-11-15     62.37    8286862
## 5 2020-11-16     73.83   43992549
## 4 2020-11-17     76.41   45138596
## 3 2020-11-18     73.48   48863408
## 2 2020-11-19     81.64   63647825
## 1 2020-11-20     81.22   13261137
# Ripple: keep only the most recent c_rows rows.
xrp <- tail(x = xrp, n = c_rows)
head(xrp)
##            Date XRP.Price XRP.Volume
## 1392 2017-01-29   0.00631     380.92
## 1391 2017-01-30   0.00645    3249.53
## 1390 2017-01-31   0.00641   13926.48
## 1389 2017-02-01   0.00649   13118.79
## 1388 2017-02-02   0.00640   13887.87
## 1387 2017-02-03   0.00638   12139.60
# Preview the last six rows of the trimmed Ripple table.
tail(xrp)
##         Date XRP.Price XRP.Volume
## 6 2020-11-15    0.2697   14292147
## 5 2020-11-16    0.2880   23107272
## 4 2020-11-17    0.3026   35535529
## 3 2020-11-18    0.2937   40662719
## 2 2020-11-19    0.3044   34206680
## 1 2020-11-20    0.3005    8675902

Remove All but One Date Variable & Merge Datasets * The master data file now spans 2017 January 29 - 2020 November 20

# Guard: cbind() pairs rows purely by position, and the per-coin Date columns
# are discarded just below. Verify first that every coin table has the same
# number of rows and the same date in every row as the Bitcoin table;
# otherwise a gap or duplicate in any one file would silently put prices from
# different days on the same row. The row-count checks come first so the date
# comparisons never run on vectors of unequal length.
stopifnot(
  nrow(eth) == nrow(btc_c),
  nrow(ltc) == nrow(btc_c),
  nrow(xrp) == nrow(btc_c),
  all(eth$Date == btc_c$Date),
  all(ltc$Date == btc_c$Date),
  all(xrp$Date == btc_c$Date)
)

# Keep a single Date column (Bitcoin's) in the merged table.
# Ethereum
eth <- subset(eth, select = -c(Date))

# Litecoin
ltc <- subset(ltc, select = -c(Date))

# Ripple
xrp <- subset(xrp, select = -c(Date))

# Bind the four tables side by side: Bitcoin columns first, then ETH, LTC, XRP.
coins <- cbind(btc_c, eth, ltc, xrp)

head(coins)
##       Timestamp       Date    Open    High    Low   Close Volume.BTC Volume.USD
## 1392 1485648000 2017-01-29  924.70  927.47 915.00  917.31    2498.61    2303176
## 1391 1485734400 2017-01-30  917.31  923.95 914.69  923.45    3678.36    3385239
## 1390 1485820800 2017-01-31  923.45  971.24 922.83  970.92    6624.94    6298154
## 1389 1485907200 2017-02-01  970.92  991.38 963.84  989.71    5983.96    5835317
## 1388 1485993600 2017-02-02  989.71 1010.00 978.74 1007.66    5623.69    5602317
## 1387 1486080000 2017-02-03 1007.66 1024.50 994.34 1016.77    6731.61    6815466
##      HL.Close ETH.Price ETH.Volume LTC.Price LTC.Volume XRP.Price XRP.Volume
## 1392        1     10.50   189086.3      3.88    2151.59   0.00631     380.92
## 1391        1     10.59   437746.5      4.03   23569.63   0.00645    3249.53
## 1390        1     10.74   413350.2      4.07   35332.93   0.00641   13926.48
## 1389        1     10.73   630953.7      4.08   17621.75   0.00649   13118.79
## 1388        1     10.82   513774.8      4.09   17753.63   0.00640   13887.87
## 1387        1     10.95   531755.4      4.06   15202.71   0.00638   12139.60

Creating master data frame

# From here on the merged coin table is referred to as `master`.
master <- coins

head(master)
##       Timestamp       Date    Open    High    Low   Close Volume.BTC Volume.USD
## 1392 1485648000 2017-01-29  924.70  927.47 915.00  917.31    2498.61    2303176
## 1391 1485734400 2017-01-30  917.31  923.95 914.69  923.45    3678.36    3385239
## 1390 1485820800 2017-01-31  923.45  971.24 922.83  970.92    6624.94    6298154
## 1389 1485907200 2017-02-01  970.92  991.38 963.84  989.71    5983.96    5835317
## 1388 1485993600 2017-02-02  989.71 1010.00 978.74 1007.66    5623.69    5602317
## 1387 1486080000 2017-02-03 1007.66 1024.50 994.34 1016.77    6731.61    6815466
##      HL.Close ETH.Price ETH.Volume LTC.Price LTC.Volume XRP.Price XRP.Volume
## 1392        1     10.50   189086.3      3.88    2151.59   0.00631     380.92
## 1391        1     10.59   437746.5      4.03   23569.63   0.00645    3249.53
## 1390        1     10.74   413350.2      4.07   35332.93   0.00641   13926.48
## 1389        1     10.73   630953.7      4.08   17621.75   0.00649   13118.79
## 1388        1     10.82   513774.8      4.09   17753.63   0.00640   13887.87
## 1387        1     10.95   531755.4      4.06   15202.71   0.00638   12139.60
# Preview the last six rows of the master table.
tail(master)
##    Timestamp       Date     Open     High      Low    Close Volume.BTC
## 6 1605398400 2020-11-15 16082.01 16175.60 15796.09 15966.89    6250.08
## 5 1605484800 2020-11-16 15966.89 16892.00 15879.00 16726.64   13948.06
## 4 1605571200 2020-11-17 16726.64 17880.00 16575.42 17679.36   25230.04
## 3 1605657600 2020-11-18 17679.36 18488.00 17205.02 17782.91   32425.64
## 2 1605744000 2020-11-19 17782.91 18193.29 17356.00 17821.58   17141.49
## 1 1605830400 2020-11-20 17821.58 18239.00 17764.76 18142.52    3909.44
##   Volume.USD HL.Close ETH.Price ETH.Volume LTC.Price LTC.Volume XRP.Price
## 6   99871183        1    448.58   40271351     62.37    8286862    0.2697
## 5  230076772        1    460.85   51758620     73.83   43992549    0.2880
## 4  436549314        1    482.68   93082972     76.41   45138596    0.3026
## 3  579119955        1    478.96  141725015     73.48   48863408    0.2937
## 2  306201498        1    471.92   62514644     81.64   63647825    0.3044
## 1   70437003        1    484.88   29558467     81.22   13261137    0.3005
##   XRP.Volume
## 6   14292147
## 5   23107272
## 4   35535529
## 3   40662719
## 2   34206680
## 1    8675902
# Report the master table's dimensions (rows, then columns).
dim(master )
## [1] 1392   15

Add Additional asset classes

S+P 500

Data Description: * Price of S&P 500 from 2017-2020 * Source: Yahoo Finance * Since the stock market is only open on weekdays, the data file does not contain weekend values * We will remove weekends from the master file to ensure alignment of the asset-class data * This will weaken the model, as it reduces the data frame size by roughly 104 weekend rows per year plus market holidays (1,392 rows fall to 961 after the alignment)

# Load the S&P 500 history, sort it oldest-first and parse the text dates
# (YYYY-MM-DD) into Date objects.
SP500 <- read.csv(file = "^GSPC.csv", header = TRUE)
SP500 <- SP500[order(SP500$Date), ]
SP500[["Date"]] <- as.Date(SP500[["Date"]], format = "%Y-%m-%d")

# Drop the intraday prices and the adjusted close.
SP500 <- subset(SP500, select = -c(Open, High, Low, Adj.Close))

# Prefix the remaining columns with the asset name so they stay unique in the
# master table: Close becomes SP500.Price, Volume becomes SP500.Volume.
names(SP500) <- replace(
  names(SP500), names(SP500) == "Close", "SP500.Price"
)
names(SP500) <- replace(
  names(SP500), names(SP500) == "Volume", "SP500.Volume"
)

head(SP500)
##         Date SP500.Price SP500.Volume
## 1 2017-01-30     2280.90   3591270000
## 2 2017-01-31     2278.87   4087450000
## 3 2017-02-01     2279.55   3916610000
## 4 2017-02-02     2280.85   3807710000
## 5 2017-02-03     2297.42   3597970000
## 6 2017-02-06     2292.56   3109050000
# Preview the last six rows (the most recent trading days).
tail(SP500)
##           Date SP500.Price SP500.Volume
## 956 2020-11-12     3537.01   4890120000
## 957 2020-11-13     3585.15   4709670000
## 958 2020-11-16     3626.91   5281980000
## 959 2020-11-17     3609.53   4799570000
## 960 2020-11-18     3567.79   5274450000
## 961 2020-11-19     3581.87   4347200000
# Report the S&P 500 table's dimensions (rows, then columns).
dim(SP500)
## [1] 961   3

Convert master data frame to only include week days * Search through the master file, find S and P 500 date, and insert the S and P 500 Price and Volume for the associated date * This is an inefficient method to search, to improve this potentially using a linear search as the data is already sorted

# Add placeholder columns for the S&P 500 values. 0 acts as the "no value for
# this date" marker until the lookup below fills in the matching dates.
master[["SP500.Price"]] <- 0
master[["SP500.Volume"]] <- 0
head(master)
##       Timestamp       Date    Open    High    Low   Close Volume.BTC Volume.USD
## 1392 1485648000 2017-01-29  924.70  927.47 915.00  917.31    2498.61    2303176
## 1391 1485734400 2017-01-30  917.31  923.95 914.69  923.45    3678.36    3385239
## 1390 1485820800 2017-01-31  923.45  971.24 922.83  970.92    6624.94    6298154
## 1389 1485907200 2017-02-01  970.92  991.38 963.84  989.71    5983.96    5835317
## 1388 1485993600 2017-02-02  989.71 1010.00 978.74 1007.66    5623.69    5602317
## 1387 1486080000 2017-02-03 1007.66 1024.50 994.34 1016.77    6731.61    6815466
##      HL.Close ETH.Price ETH.Volume LTC.Price LTC.Volume XRP.Price XRP.Volume
## 1392        1     10.50   189086.3      3.88    2151.59   0.00631     380.92
## 1391        1     10.59   437746.5      4.03   23569.63   0.00645    3249.53
## 1390        1     10.74   413350.2      4.07   35332.93   0.00641   13926.48
## 1389        1     10.73   630953.7      4.08   17621.75   0.00649   13118.79
## 1388        1     10.82   513774.8      4.09   17753.63   0.00640   13887.87
## 1387        1     10.95   531755.4      4.06   15202.71   0.00638   12139.60
##      search$Google_Search_Frequency SP500.Price SP500.Volume
## 1392                              4           0            0
## 1391                              4           0            0
## 1390                              4           0            0
## 1389                              4           0            0
## 1388                              4           0            0
## 1387                              4           0            0
# Copy the S&P 500 price and volume onto every master row that has the same
# date. match() returns, for each master date, the position of the first equal
# date in SP500 (NA when there is none), so the whole lookup is a single
# vectorised pass. The earlier nested loops compared every master date with
# every SP500 date, and their trailing `next` did not leave the inner loop
# after a hit.
sp_row <- match(master$Date, SP500$Date)
has_sp <- !is.na(sp_row)

# Rows without a match (weekends, market holidays) keep their existing 0
# placeholder.
master$SP500.Price[has_sp] <- SP500$SP500.Price[sp_row[has_sp]]
master$SP500.Volume[has_sp] <- SP500$SP500.Volume[sp_row[has_sp]]

head(master)
##       Timestamp       Date    Open    High    Low   Close Volume.BTC Volume.USD
## 1392 1485648000 2017-01-29  924.70  927.47 915.00  917.31    2498.61    2303176
## 1391 1485734400 2017-01-30  917.31  923.95 914.69  923.45    3678.36    3385239
## 1390 1485820800 2017-01-31  923.45  971.24 922.83  970.92    6624.94    6298154
## 1389 1485907200 2017-02-01  970.92  991.38 963.84  989.71    5983.96    5835317
## 1388 1485993600 2017-02-02  989.71 1010.00 978.74 1007.66    5623.69    5602317
## 1387 1486080000 2017-02-03 1007.66 1024.50 994.34 1016.77    6731.61    6815466
##      HL.Close ETH.Price ETH.Volume LTC.Price LTC.Volume XRP.Price XRP.Volume
## 1392        1     10.50   189086.3      3.88    2151.59   0.00631     380.92
## 1391        1     10.59   437746.5      4.03   23569.63   0.00645    3249.53
## 1390        1     10.74   413350.2      4.07   35332.93   0.00641   13926.48
## 1389        1     10.73   630953.7      4.08   17621.75   0.00649   13118.79
## 1388        1     10.82   513774.8      4.09   17753.63   0.00640   13887.87
## 1387        1     10.95   531755.4      4.06   15202.71   0.00638   12139.60
##      search$Google_Search_Frequency SP500.Price SP500.Volume
## 1392                              4        0.00            0
## 1391                              4     2280.90   3591270000
## 1390                              4     2278.87   4087450000
## 1389                              4     2279.55   3916610000
## 1388                              4     2280.85   3807710000
## 1387                              4     2297.42   3597970000
# Preview the last six rows; dates with no S&P 500 entry still show 0.
tail(master)
##    Timestamp       Date     Open     High      Low    Close Volume.BTC
## 6 1605398400 2020-11-15 16082.01 16175.60 15796.09 15966.89    6250.08
## 5 1605484800 2020-11-16 15966.89 16892.00 15879.00 16726.64   13948.06
## 4 1605571200 2020-11-17 16726.64 17880.00 16575.42 17679.36   25230.04
## 3 1605657600 2020-11-18 17679.36 18488.00 17205.02 17782.91   32425.64
## 2 1605744000 2020-11-19 17782.91 18193.29 17356.00 17821.58   17141.49
## 1 1605830400 2020-11-20 17821.58 18239.00 17764.76 18142.52    3909.44
##   Volume.USD HL.Close ETH.Price ETH.Volume LTC.Price LTC.Volume XRP.Price
## 6   99871183        1    448.58   40271351     62.37    8286862    0.2697
## 5  230076772        1    460.85   51758620     73.83   43992549    0.2880
## 4  436549314        1    482.68   93082972     76.41   45138596    0.3026
## 3  579119955        1    478.96  141725015     73.48   48863408    0.2937
## 2  306201498        1    471.92   62514644     81.64   63647825    0.3044
## 1   70437003        1    484.88   29558467     81.22   13261137    0.3005
##   XRP.Volume search$Google_Search_Frequency SP500.Price SP500.Volume
## 6   14292147                             22        0.00            0
## 5   23107272                             22     3626.91   5281980000
## 4   35535529                             22     3609.53   4799570000
## 3   40662719                             22     3567.79   5274450000
## 2   34206680                             22     3581.87   4347200000
## 1    8675902                             22        0.00            0

Removing Weekends/Holidays * Remove any values the SP 500 didn’t have * now only has 961 rows of data

# Keep a row when at least one of the two S&P 500 columns was filled in; rows
# where both still hold the 0 placeholder had no S&P 500 entry for that date.
has_sp500 <- master$SP500.Price != 0 | master$SP500.Volume != 0
master <- master[has_sp500, ]

head(master)
##       Timestamp       Date    Open    High     Low   Close Volume.BTC
## 1391 1485734400 2017-01-30  917.31  923.95  914.69  923.45    3678.36
## 1390 1485820800 2017-01-31  923.45  971.24  922.83  970.92    6624.94
## 1389 1485907200 2017-02-01  970.92  991.38  963.84  989.71    5983.96
## 1388 1485993600 2017-02-02  989.71 1010.00  978.74 1007.66    5623.69
## 1387 1486080000 2017-02-03 1007.66 1024.50  994.34 1016.77    6731.61
## 1384 1486339200 2017-02-06 1019.31 1027.70 1014.64 1024.39    4227.33
##      Volume.USD HL.Close ETH.Price ETH.Volume LTC.Price LTC.Volume XRP.Price
## 1391    3385239        1     10.59   437746.5      4.03   23569.63   0.00645
## 1390    6298154        1     10.74   413350.2      4.07   35332.93   0.00641
## 1389    5835317        1     10.73   630953.7      4.08   17621.75   0.00649
## 1388    5602317        1     10.82   513774.8      4.09   17753.63   0.00640
## 1387    6815466        1     10.95   531755.4      4.06   15202.71   0.00638
## 1384    4321741        1     11.34   509447.5      3.97    6224.56   0.00638
##      XRP.Volume search$Google_Search_Frequency SP500.Price SP500.Volume
## 1391    3249.53                              4     2280.90   3591270000
## 1390   13926.48                              4     2278.87   4087450000
## 1389   13118.79                              4     2279.55   3916610000
## 1388   13887.87                              4     2280.85   3807710000
## 1387   12139.60                              4     2297.42   3597970000
## 1384    2697.23                              4     2292.56   3109050000
# Preview the last six rows after the dates without S&P 500 data were removed.
tail(master)
##    Timestamp       Date     Open     High      Low    Close Volume.BTC
## 9 1605139200 2020-11-12 15705.79 16370.89 15446.82 16310.81   22153.74
## 8 1605225600 2020-11-13 16310.81 16491.92 15975.00 16339.56   14593.52
## 5 1605484800 2020-11-16 15966.89 16892.00 15879.00 16726.64   13948.06
## 4 1605571200 2020-11-17 16726.64 17880.00 16575.42 17679.36   25230.04
## 3 1605657600 2020-11-18 17679.36 18488.00 17205.02 17782.91   32425.64
## 2 1605744000 2020-11-19 17782.91 18193.29 17356.00 17821.58   17141.49
##   Volume.USD HL.Close ETH.Price ETH.Volume LTC.Price LTC.Volume XRP.Price
## 9  353634687        1    462.98   83026488     60.74   11126850    0.2551
## 8  237577678        0    477.14   63134324     66.01   30783340    0.2660
## 5  230076772        1    460.85   51758620     73.83   43992549    0.2880
## 4  436549314        1    482.68   93082972     76.41   45138596    0.3026
## 3  579119955        1    478.96  141725015     73.48   48863408    0.2937
## 2  306201498        1    471.92   62514644     81.64   63647825    0.3044
##   XRP.Volume search$Google_Search_Frequency SP500.Price SP500.Volume
## 9   16698901                             15     3537.01   4890120000
## 8   18763922                             15     3585.15   4709670000
## 5   23107272                             22     3626.91   5281980000
## 4   35535529                             22     3609.53   4799570000
## 3   40662719                             22     3567.79   5274450000
## 2   34206680                             22     3581.87   4347200000
# Report the master table's dimensions (rows, then columns) after filtering.
dim(master)
## [1] 961  18

Gold

Data Description * Price of gold from 2017- 2020 * Source: Yahoo Finance

# Load the gold futures history. The file marks missing values with the
# literal text "null"; declaring it in na.strings makes read.csv() parse the
# price and volume columns as numbers straight away. Without this the columns
# are read as text, or as factors on R versions before 4.0, where a later
# as.numeric() would return factor level codes instead of the actual values.
gold <- read.csv("GC=F.csv", header = TRUE, na.strings = c("NA", "null"))
gold <- gold[order(gold$Date), ]
gold$Date <- as.Date(gold$Date, format = "%Y-%m-%d")

# Drop the intraday prices and the adjusted close.
gold <- subset(gold, select = -c(Open, High, Low, Adj.Close))

# Prefix the remaining columns with the asset name so they stay unique in the
# master table: Close becomes Gold.Price, Volume becomes Gold.Volume.
names(gold)[names(gold) == "Close"] <- "Gold.Price"
names(gold)[names(gold) == "Volume"] <- "Gold.Volume"

# Remove the rows in which both price and volume are missing (previously the
# rows where both held the text "null").
gold <- gold[!(is.na(gold$Gold.Price) & is.na(gold$Gold.Volume)), ]

# Store both columns as doubles, as before (volume may have been read as
# integer).
gold$Gold.Price <- as.numeric(gold$Gold.Price)
gold$Gold.Volume <- as.numeric(gold$Gold.Volume)

head(gold)
##         Date Gold.Price Gold.Volume
## 1 2017-01-30     1193.2       50503
## 2 2017-01-31     1208.6        3212
## 3 2017-02-01     1205.6        1145
## 4 2017-02-02     1216.7        1512
## 5 2017-02-03     1218.5         865
## 7 2017-02-06     1230.0         908
# Preview the last six rows (the most recent trading days).
tail(gold)
##            Date Gold.Price Gold.Volume
## 1153 2020-11-12     1872.6         220
## 1154 2020-11-13     1885.7         240
## 1156 2020-11-16     1887.3           6
## 1157 2020-11-17     1884.5          59
## 1158 2020-11-18     1873.5         152
## 1159 2020-11-19     1861.1          59
# Report the gold table's dimensions (rows, then columns).
dim(gold)
## [1] 1036    3

Aligning with Master Data Set

# Placeholder columns; 0 marks "no gold value for this date".
master$Gold.Price <- 0
master$Gold.Volume <- 0

# Copy the gold price and volume onto every master row that has the same date.
# match() returns, for each master date, the position of the first equal date
# in gold (NA when there is none), so the whole lookup is a single vectorised
# pass. The earlier nested loops compared every master date with every gold
# date, and their trailing `next` did not leave the inner loop after a hit.
gold_row <- match(master$Date, gold$Date)
has_gold <- !is.na(gold_row)

# Rows without a match keep the 0 placeholder.
master$Gold.Price[has_gold] <- gold$Gold.Price[gold_row[has_gold]]
master$Gold.Volume[has_gold] <- gold$Gold.Volume[gold_row[has_gold]]

head(master)
##       Timestamp       Date    Open    High     Low   Close Volume.BTC
## 1391 1485734400 2017-01-30  917.31  923.95  914.69  923.45    3678.36
## 1390 1485820800 2017-01-31  923.45  971.24  922.83  970.92    6624.94
## 1389 1485907200 2017-02-01  970.92  991.38  963.84  989.71    5983.96
## 1388 1485993600 2017-02-02  989.71 1010.00  978.74 1007.66    5623.69
## 1387 1486080000 2017-02-03 1007.66 1024.50  994.34 1016.77    6731.61
## 1384 1486339200 2017-02-06 1019.31 1027.70 1014.64 1024.39    4227.33
##      Volume.USD HL.Close ETH.Price ETH.Volume LTC.Price LTC.Volume XRP.Price
## 1391    3385239        1     10.59   437746.5      4.03   23569.63   0.00645
## 1390    6298154        1     10.74   413350.2      4.07   35332.93   0.00641
## 1389    5835317        1     10.73   630953.7      4.08   17621.75   0.00649
## 1388    5602317        1     10.82   513774.8      4.09   17753.63   0.00640
## 1387    6815466        1     10.95   531755.4      4.06   15202.71   0.00638
## 1384    4321741        1     11.34   509447.5      3.97    6224.56   0.00638
##      XRP.Volume search$Google_Search_Frequency SP500.Price SP500.Volume
## 1391    3249.53                              4     2280.90   3591270000
## 1390   13926.48                              4     2278.87   4087450000
## 1389   13118.79                              4     2279.55   3916610000
## 1388   13887.87                              4     2280.85   3807710000
## 1387   12139.60                              4     2297.42   3597970000
## 1384    2697.23                              4     2292.56   3109050000
##      Gold.Price Gold.Volume
## 1391     1193.2       50503
## 1390     1208.6        3212
## 1389     1205.6        1145
## 1388     1216.7        1512
## 1387     1218.5         865
## 1384     1230.0         908
# Preview the last six rows with the gold columns attached.
tail(master)
##    Timestamp       Date     Open     High      Low    Close Volume.BTC
## 9 1605139200 2020-11-12 15705.79 16370.89 15446.82 16310.81   22153.74
## 8 1605225600 2020-11-13 16310.81 16491.92 15975.00 16339.56   14593.52
## 5 1605484800 2020-11-16 15966.89 16892.00 15879.00 16726.64   13948.06
## 4 1605571200 2020-11-17 16726.64 17880.00 16575.42 17679.36   25230.04
## 3 1605657600 2020-11-18 17679.36 18488.00 17205.02 17782.91   32425.64
## 2 1605744000 2020-11-19 17782.91 18193.29 17356.00 17821.58   17141.49
##   Volume.USD HL.Close ETH.Price ETH.Volume LTC.Price LTC.Volume XRP.Price
## 9  353634687        1    462.98   83026488     60.74   11126850    0.2551
## 8  237577678        0    477.14   63134324     66.01   30783340    0.2660
## 5  230076772        1    460.85   51758620     73.83   43992549    0.2880
## 4  436549314        1    482.68   93082972     76.41   45138596    0.3026
## 3  579119955        1    478.96  141725015     73.48   48863408    0.2937
## 2  306201498        1    471.92   62514644     81.64   63647825    0.3044
##   XRP.Volume search$Google_Search_Frequency SP500.Price SP500.Volume Gold.Price
## 9   16698901                             15     3537.01   4890120000     1872.6
## 8   18763922                             15     3585.15   4709670000     1885.7
## 5   23107272                             22     3626.91   5281980000     1887.3
## 4   35535529                             22     3609.53   4799570000     1884.5
## 3   40662719                             22     3567.79   5274450000     1873.5
## 2   34206680                             22     3581.87   4347200000     1861.1
##   Gold.Volume
## 9         220
## 8         240
## 5           6
## 4          59
## 3         152
## 2          59

The data now has 952 data points to work with and 20 columns (features) * 952 data points are very few considering the complexity of this task * 20 features provide a sufficient number of options for creating predictive outcomes * It’ll be difficult to build a strong predictive model given the small data set, randomness of the data, and relatively simple machine learning models.

# Remove "blank" rows: keep a row unless both gold fields are zero
master <- master[master$Gold.Price != 0 | master$Gold.Volume != 0, ]

# Rows x columns left after the filter
dim(master)
## [1] 952  20

Data Modeling

# Per-column descriptive statistics (min, quartiles, mean, max) for master
summary(master)
##    Timestamp              Date                 Open              High      
##  Min.   :1.486e+09   Min.   :2017-01-30   Min.   :  917.3   Min.   :  924  
##  1st Qu.:1.516e+09   1st Qu.:2018-01-10   1st Qu.: 4208.0   1st Qu.: 4361  
##  Median :1.546e+09   Median :2018-12-27   Median : 7357.7   Median : 7561  
##  Mean   :1.546e+09   Mean   :2018-12-26   Mean   : 7222.7   Mean   : 7443  
##  3rd Qu.:1.576e+09   3rd Qu.:2019-12-10   3rd Qu.: 9527.5   3rd Qu.: 9733  
##  Max.   :1.606e+09   Max.   :2020-11-19   Max.   :19379.0   Max.   :19651  
##       Low              Close           Volume.BTC       Volume.USD       
##  Min.   :  914.7   Min.   :  923.5   Min.   :     0   Min.   :0.000e+00  
##  1st Qu.: 4047.4   1st Qu.: 4213.8   1st Qu.:  8114   1st Qu.:4.264e+07  
##  Median : 7127.2   Median : 7350.8   Median : 12052   Median :8.201e+07  
##  Mean   : 6974.9   Mean   : 7239.9   Mean   : 15036   Mean   :1.167e+08  
##  3rd Qu.: 9229.2   3rd Qu.: 9523.4   3rd Qu.: 17937   3rd Qu.:1.427e+08  
##  Max.   :18200.0   Max.   :19039.0   Max.   :117495   Max.   :1.238e+09  
##     HL.Close        ETH.Price         ETH.Volume          LTC.Price     
##  Min.   :0.0000   Min.   :  10.59   Min.   :        0   Min.   :  3.76  
##  1st Qu.:0.0000   1st Qu.: 164.73   1st Qu.: 15376424   1st Qu.: 43.58  
##  Median :1.0000   Median : 227.47   Median : 31965666   Median : 55.84  
##  Mean   :0.5483   Mean   : 292.97   Mean   : 50464237   Mean   : 70.43  
##  3rd Qu.:1.0000   3rd Qu.: 368.11   3rd Qu.: 58655071   3rd Qu.: 79.82  
##  Max.   :1.0000   Max.   :1290.01   Max.   :736027536   Max.   :359.40  
##    LTC.Volume          XRP.Price         XRP.Volume       
##  Min.   :0.000e+00   Min.   :0.00539   Min.   :     2697  
##  1st Qu.:5.991e+06   1st Qu.:0.20945   1st Qu.:  4118799  
##  Median :1.195e+07   Median :0.27870   Median :  8967614  
##  Mean   :2.805e+07   Mean   :0.35137   Mean   : 15765896  
##  3rd Qu.:2.461e+07   3rd Qu.:0.38880   3rd Qu.: 16934925  
##  Max.   :1.082e+09   Max.   :2.73000   Max.   :335937893  
##  search$Google_Search_Frequency  SP500.Price    SP500.Volume      
##  Min.   :  4.00                 Min.   :2237   Min.   :1.969e+09  
##  1st Qu.:  9.00                 1st Qu.:2596   1st Qu.:3.271e+09  
##  Median : 11.00                 Median :2795   Median :3.598e+09  
##  Mean   : 14.51                 Mean   :2819   Mean   :3.880e+09  
##  3rd Qu.: 14.25                 3rd Qu.:2992   3rd Qu.:4.083e+09  
##  Max.   :100.00                 Max.   :3627   Max.   :9.045e+09  
##    Gold.Price    Gold.Volume    
##  Min.   :1176   Min.   :     0  
##  1st Qu.:1265   1st Qu.:    28  
##  Median :1313   Median :   135  
##  Mean   :1416   Mean   :  6087  
##  3rd Qu.:1512   3rd Qu.:   474  
##  Max.   :2052   Max.   :386334

The min and max of the price variables, such as Ethereum’s min of 10.59 and max of 1290, show the rapid growth and volatility of the cryptocurrency market. This is especially clear when cryptocurrency is compared with traditional asset classes over the same period, such as the S&P 500, which had a min of 2237 and a max of 3627, and Gold, which had a min of 1176 and a max of 2052.

Display Histograms

# Distribution of daily traded volume, in BTC
hist(master$Volume.BTC, col = "blue")

# Distribution of daily traded volume, in USD
hist(master$Volume.USD, col = "blue")

# Distribution of the closing price
hist(master$Close, col = "blue")

## Display Scatter Plots * Display the Close price over time; this confirms how volatile Bitcoin prices are

# Closing price plotted against the date
plot(master$Date, master$Close, pch = 20, col = "red")

# Peek at the first rows of master
head(master)
##       Timestamp       Date    Open    High     Low   Close Volume.BTC
## 1391 1485734400 2017-01-30  917.31  923.95  914.69  923.45    3678.36
## 1390 1485820800 2017-01-31  923.45  971.24  922.83  970.92    6624.94
## 1389 1485907200 2017-02-01  970.92  991.38  963.84  989.71    5983.96
## 1388 1485993600 2017-02-02  989.71 1010.00  978.74 1007.66    5623.69
## 1387 1486080000 2017-02-03 1007.66 1024.50  994.34 1016.77    6731.61
## 1384 1486339200 2017-02-06 1019.31 1027.70 1014.64 1024.39    4227.33
##      Volume.USD HL.Close ETH.Price ETH.Volume LTC.Price LTC.Volume XRP.Price
## 1391    3385239        1     10.59   437746.5      4.03   23569.63   0.00645
## 1390    6298154        1     10.74   413350.2      4.07   35332.93   0.00641
## 1389    5835317        1     10.73   630953.7      4.08   17621.75   0.00649
## 1388    5602317        1     10.82   513774.8      4.09   17753.63   0.00640
## 1387    6815466        1     10.95   531755.4      4.06   15202.71   0.00638
## 1384    4321741        1     11.34   509447.5      3.97    6224.56   0.00638
##      XRP.Volume search$Google_Search_Frequency SP500.Price SP500.Volume
## 1391    3249.53                              4     2280.90   3591270000
## 1390   13926.48                              4     2278.87   4087450000
## 1389   13118.79                              4     2279.55   3916610000
## 1388   13887.87                              4     2280.85   3807710000
## 1387   12139.60                              4     2297.42   3597970000
## 1384    2697.23                              4     2292.56   3109050000
##      Gold.Price Gold.Volume
## 1391     1193.2       50503
## 1390     1208.6        3212
## 1389     1205.6        1145
## 1388     1216.7        1512
## 1387     1218.5         865
## 1384     1230.0         908

Plots - If Y variable was Close ($)

# Scatter plots of each candidate predictor against the BTC closing price.
# The two gold columns are coerced to double before plotting.
master$Gold.Price <- as.double(master$Gold.Price)
master$Gold.Volume <- as.double(master$Gold.Volume)

# Predictors in the order they are plotted. Driving the plots from one vector
# avoids copy-paste slips (ETH.Price used to be plotted twice).
close_predictors <- c(
  "Volume.USD", "Volume.BTC",
  "ETH.Price", "ETH.Volume",
  "LTC.Price", "LTC.Volume",
  "XRP.Price", "XRP.Volume",
  "SP500.Price", "SP500.Volume",
  "Gold.Price", "Gold.Volume"
)

for (predictor in close_predictors) {
  plot(
    master[[predictor]], master$Close,
    pch = 20, col = "red",
    xlab = predictor, ylab = "Close"
  )
}

#plot( master$HL.Close,master$Close,pch=20,col="red")

Cleaning the master

# (Disabled) Move the response variable HL.Close to the last column.
# If re-enabled, run it only once: re-running on an already reordered frame
# would shuffle the positional indices again.
# NOTE(review): with this reorder disabled HL.Close stays at position 9 of
# `master`, so later positional column slices do not drop it - confirm they
# still select the intended columns.
#master <- master[,c(1,2,3,4,5,6,7,8,10,11,12,13,14,15,16,17,18,19,20,9)]

# List the column names to check the current column order
names(master)
##  [1] "Timestamp"                      "Date"                          
##  [3] "Open"                           "High"                          
##  [5] "Low"                            "Close"                         
##  [7] "Volume.BTC"                     "Volume.USD"                    
##  [9] "HL.Close"                       "ETH.Price"                     
## [11] "ETH.Volume"                     "LTC.Price"                     
## [13] "LTC.Volume"                     "XRP.Price"                     
## [15] "XRP.Volume"                     "search$Google_Search_Frequency"
## [17] "SP500.Price"                    "SP500.Volume"                  
## [19] "Gold.Price"                     "Gold.Volume"
# Leftover scratch line for coercing values with as.numeric() before
# correlating (disabled; its parentheses are unbalanced as written)
# cor(as.numeric(RETS), as.numeric(RETS) -> correl

# cor() only accepts numeric input, so inspect the class of every column first
sapply(master, class)
##                      Timestamp                           Date 
##                      "integer"                         "Date" 
##                           Open                           High 
##                      "numeric"                      "numeric" 
##                            Low                          Close 
##                      "numeric"                      "numeric" 
##                     Volume.BTC                     Volume.USD 
##                      "numeric"                      "numeric" 
##                       HL.Close                      ETH.Price 
##                      "numeric"                      "numeric" 
##                     ETH.Volume                      LTC.Price 
##                      "numeric"                      "numeric" 
##                     LTC.Volume                      XRP.Price 
##                      "numeric"                      "numeric" 
##                     XRP.Volume search$Google_Search_Frequency 
##                      "numeric"                      "numeric" 
##                    SP500.Price                   SP500.Volume 
##                      "numeric"                      "numeric" 
##                     Gold.Price                    Gold.Volume 
##                      "numeric"                      "numeric"
# Give the Google search-frequency column a plain, syntactic name
colnames(master)[colnames(master) == "search$Google_Search_Frequency"] <- "Google.Search"

# head(master)

Correlation Master Data Set

# Drop the two time columns; what remains feeds the correlation analysis
master_cor <- subset(master, select = -c(Timestamp, Date))

# From here on `master` itself no longer carries Timestamp or Date
master <- master_cor

head(master_cor)
##         Open    High     Low   Close Volume.BTC Volume.USD HL.Close ETH.Price
## 1391  917.31  923.95  914.69  923.45    3678.36    3385239        1     10.59
## 1390  923.45  971.24  922.83  970.92    6624.94    6298154        1     10.74
## 1389  970.92  991.38  963.84  989.71    5983.96    5835317        1     10.73
## 1388  989.71 1010.00  978.74 1007.66    5623.69    5602317        1     10.82
## 1387 1007.66 1024.50  994.34 1016.77    6731.61    6815466        1     10.95
## 1384 1019.31 1027.70 1014.64 1024.39    4227.33    4321741        1     11.34
##      ETH.Volume LTC.Price LTC.Volume XRP.Price XRP.Volume Google.Search
## 1391   437746.5      4.03   23569.63   0.00645    3249.53             4
## 1390   413350.2      4.07   35332.93   0.00641   13926.48             4
## 1389   630953.7      4.08   17621.75   0.00649   13118.79             4
## 1388   513774.8      4.09   17753.63   0.00640   13887.87             4
## 1387   531755.4      4.06   15202.71   0.00638   12139.60             4
## 1384   509447.5      3.97    6224.56   0.00638    2697.23             4
##      SP500.Price SP500.Volume Gold.Price Gold.Volume
## 1391     2280.90   3591270000     1193.2       50503
## 1390     2278.87   4087450000     1208.6        3212
## 1389     2279.55   3916610000     1205.6        1145
## 1388     2280.85   3807710000     1216.7        1512
## 1387     2297.42   3597970000     1218.5         865
## 1384     2292.56   3109050000     1230.0         908

Correlation Matrix with HL.Close

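HL.Close is our Y variable. Note that the `1:17` column slice used below still includes HL.Close; the column it leaves out of the 18-column `master_cor` is Gold.Volume. The next section repeats the analysis with HL.Close removed.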

# Rounded (2 d.p.) correlation matrix over the first 17 columns of master_cor.
# NOTE(review): master_cor has 18 columns, so this positional slice keeps
# HL.Close and leaves out the last column (Gold.Volume) - confirm intended.
x <- round(cor(master_cor[, 1:17]), digits = 2)
library("corrplot")
## corrplot 0.84 loaded
# Print the unrounded matrix for the same slice
cor(master_cor[, 1:17])
##                      Open        High         Low       Close  Volume.BTC
## Open           1.00000000  0.99614666  0.99385953  0.99398782  0.21378860
## High           0.99614666  1.00000000  0.99160853  0.99728517  0.25297738
## Low            0.99385953  0.99160853  1.00000000  0.99597589  0.15193856
## Close          0.99398782  0.99728517  0.99597589  1.00000000  0.21098044
## Volume.BTC     0.21378860  0.25297738  0.15193856  0.21098044  1.00000000
## Volume.USD     0.62722020  0.66322474  0.57190725  0.62752404  0.81984498
## HL.Close      -0.04916218 -0.04944641 -0.05103671 -0.05279288  0.02424055
## ETH.Price      0.57120233  0.57731423  0.55002907  0.56566335  0.21407297
## ETH.Volume     0.39517796  0.40934477  0.33473403  0.37747912  0.58759126
## LTC.Price      0.61142077  0.62015225  0.58566120  0.60656188  0.23692178
## LTC.Volume     0.38258455  0.39652530  0.33444569  0.37431167  0.42595723
## XRP.Price      0.43021218  0.43589057  0.40887769  0.42424650  0.12991861
## XRP.Volume     0.45826224  0.46490252  0.42030343  0.44555048  0.34327036
## Google.Search  0.48658082  0.51339231  0.44401155  0.48907791  0.50178696
## SP500.Price    0.69859731  0.68155501  0.72429409  0.69906597 -0.10012093
## SP500.Volume   0.15383423  0.15449643  0.15617054  0.15441583  0.23529218
## Gold.Price     0.57359361  0.56075600  0.59683323  0.57662753  0.02599601
##                 Volume.USD      HL.Close   ETH.Price   ETH.Volume   LTC.Price
## Open           0.627220202 -0.0491621820  0.57120233  0.395177961  0.61142077
## High           0.663224739 -0.0494464119  0.57731423  0.409344767  0.62015225
## Low            0.571907249 -0.0510367083  0.55002907  0.334734029  0.58566120
## Close          0.627524040 -0.0527928772  0.56566335  0.377479124  0.60656188
## Volume.BTC     0.819844983  0.0242405549  0.21407297  0.587591264  0.23692178
## Volume.USD     1.000000000 -0.0046017805  0.42877175  0.647834573  0.50569552
## HL.Close      -0.004601781  1.0000000000 -0.02680736  0.007663671 -0.03813247
## ETH.Price      0.428771749 -0.0268073573  1.00000000  0.666252503  0.83472892
## ETH.Volume     0.647834573  0.0076636710  0.66625250  1.000000000  0.58993239
## LTC.Price      0.505695524 -0.0381324677  0.83472892  0.589932385  1.00000000
## LTC.Volume     0.603150494 -0.0049336060  0.47356322  0.669756933  0.68390439
## XRP.Price      0.301901750 -0.0112122870  0.81737650  0.574323310  0.80081994
## XRP.Volume     0.487882284  0.0435668579  0.51665623  0.695489155  0.52908491
## Google.Search  0.699667142  0.0148804734  0.57148617  0.659257829  0.67084997
## SP500.Price    0.200549244 -0.0368106151  0.09592880 -0.022683036  0.05719449
## SP500.Volume   0.168247659  0.0307668933 -0.10137879  0.027693148 -0.16750611
## Gold.Price     0.217613931 -0.0003576898  0.01365688 -0.004235498 -0.12856161
##                 LTC.Volume    XRP.Price XRP.Volume Google.Search  SP500.Price
## Open           0.382584546  0.430212180 0.45826224    0.48658082  0.698597308
## High           0.396525305  0.435890572 0.46490252    0.51339231  0.681555010
## Low            0.334445688  0.408877693 0.42030343    0.44401155  0.724294088
## Close          0.374311669  0.424246499 0.44555048    0.48907791  0.699065970
## Volume.BTC     0.425957234  0.129918612 0.34327036    0.50178696 -0.100120933
## Volume.USD     0.603150494  0.301901750 0.48788228    0.69966714  0.200549244
## HL.Close      -0.004933606 -0.011212287 0.04356686    0.01488047 -0.036810615
## ETH.Price      0.473563215  0.817376505 0.51665623    0.57148617  0.095928802
## ETH.Volume     0.669756933  0.574323310 0.69548915    0.65925783 -0.022683036
## LTC.Price      0.683904393  0.800819943 0.52908491    0.67084997  0.057194488
## LTC.Volume     1.000000000  0.377375611 0.44231315    0.74015179 -0.090281630
## XRP.Price      0.377375611  1.000000000 0.69792442    0.45554421  0.002754101
## XRP.Volume     0.442313148  0.697924425 1.00000000    0.52513982  0.103723173
## Google.Search  0.740151787  0.455544206 0.52513982    1.00000000 -0.091750110
## SP500.Price   -0.090281630  0.002754101 0.10372317   -0.09175011  1.000000000
## SP500.Volume  -0.074001768 -0.141778731 0.13827935   -0.02174655  0.154385820
## Gold.Price    -0.132099033 -0.183135446 0.11793121   -0.08913261  0.764878470
##               SP500.Volume    Gold.Price
## Open            0.15383423  0.5735936055
## High            0.15449643  0.5607560029
## Low             0.15617054  0.5968332252
## Close           0.15441583  0.5766275250
## Volume.BTC      0.23529218  0.0259960126
## Volume.USD      0.16824766  0.2176139306
## HL.Close        0.03076689 -0.0003576898
## ETH.Price      -0.10137879  0.0136568846
## ETH.Volume      0.02769315 -0.0042354975
## LTC.Price      -0.16750611 -0.1285616073
## LTC.Volume     -0.07400177 -0.1320990330
## XRP.Price      -0.14177873 -0.1831354460
## XRP.Volume      0.13827935  0.1179312139
## Google.Search  -0.02174655 -0.0891326080
## SP500.Price     0.15438582  0.7648784698
## SP500.Volume    1.00000000  0.4536416658
## Gold.Price      0.45364167  1.0000000000
library("Hmisc")
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, units
library(RColorBrewer)

# Correlation plot with the coefficients printed, same first-17-column slice
corrplot(cor(master_cor[, 1:17]), method = "number")

# Upper-triangle plot over every column of master_cor, with variables
# ordered by hierarchical clustering
M <- cor(master_cor)
corrplot(
  M,
  type = "upper",
  order = "hclust",
  col = brewer.pal(n = 8, name = "RdYlBu")
)

## Correlation Matrix without HL.Close

# Same frame as master_cor, minus the response column
master_cor2 <- subset(master_cor, select = -HL.Close)

# Rounded (2 d.p.) correlation matrix of the first 17 columns
x <- round(cor(master_cor2[, 1:17]), digits = 2)
library("corrplot")

# Print the unrounded matrix
cor(master_cor2[, 1:17])
##                      Open        High         Low       Close   Volume.BTC
## Open           1.00000000  0.99614666  0.99385953  0.99398782  0.213788601
## High           0.99614666  1.00000000  0.99160853  0.99728517  0.252977377
## Low            0.99385953  0.99160853  1.00000000  0.99597589  0.151938564
## Close          0.99398782  0.99728517  0.99597589  1.00000000  0.210980441
## Volume.BTC     0.21378860  0.25297738  0.15193856  0.21098044  1.000000000
## Volume.USD     0.62722020  0.66322474  0.57190725  0.62752404  0.819844983
## ETH.Price      0.57120233  0.57731423  0.55002907  0.56566335  0.214072969
## ETH.Volume     0.39517796  0.40934477  0.33473403  0.37747912  0.587591264
## LTC.Price      0.61142077  0.62015225  0.58566120  0.60656188  0.236921785
## LTC.Volume     0.38258455  0.39652530  0.33444569  0.37431167  0.425957234
## XRP.Price      0.43021218  0.43589057  0.40887769  0.42424650  0.129918612
## XRP.Volume     0.45826224  0.46490252  0.42030343  0.44555048  0.343270365
## Google.Search  0.48658082  0.51339231  0.44401155  0.48907791  0.501786959
## SP500.Price    0.69859731  0.68155501  0.72429409  0.69906597 -0.100120933
## SP500.Volume   0.15383423  0.15449643  0.15617054  0.15441583  0.235292179
## Gold.Price     0.57359361  0.56075600  0.59683323  0.57662753  0.025996013
## Gold.Volume   -0.02323081 -0.02153901 -0.02183498 -0.02167915 -0.003082816
##                Volume.USD   ETH.Price   ETH.Volume    LTC.Price  LTC.Volume
## Open           0.62722020  0.57120233  0.395177961  0.611420771  0.38258455
## High           0.66322474  0.57731423  0.409344767  0.620152247  0.39652530
## Low            0.57190725  0.55002907  0.334734029  0.585661205  0.33444569
## Close          0.62752404  0.56566335  0.377479124  0.606561882  0.37431167
## Volume.BTC     0.81984498  0.21407297  0.587591264  0.236921785  0.42595723
## Volume.USD     1.00000000  0.42877175  0.647834573  0.505695524  0.60315049
## ETH.Price      0.42877175  1.00000000  0.666252503  0.834728924  0.47356322
## ETH.Volume     0.64783457  0.66625250  1.000000000  0.589932385  0.66975693
## LTC.Price      0.50569552  0.83472892  0.589932385  1.000000000  0.68390439
## LTC.Volume     0.60315049  0.47356322  0.669756933  0.683904393  1.00000000
## XRP.Price      0.30190175  0.81737650  0.574323310  0.800819943  0.37737561
## XRP.Volume     0.48788228  0.51665623  0.695489155  0.529084912  0.44231315
## Google.Search  0.69966714  0.57148617  0.659257829  0.670849973  0.74015179
## SP500.Price    0.20054924  0.09592880 -0.022683036  0.057194488 -0.09028163
## SP500.Volume   0.16824766 -0.10137879  0.027693148 -0.167506108 -0.07400177
## Gold.Price     0.21761393  0.01365688 -0.004235498 -0.128561607 -0.13209903
## Gold.Volume   -0.02005472  0.01169730 -0.012074959  0.004945084 -0.02073302
##                  XRP.Price  XRP.Volume Google.Search  SP500.Price SP500.Volume
## Open           0.430212180  0.45826224    0.48658082  0.698597308   0.15383423
## High           0.435890572  0.46490252    0.51339231  0.681555010   0.15449643
## Low            0.408877693  0.42030343    0.44401155  0.724294088   0.15617054
## Close          0.424246499  0.44555048    0.48907791  0.699065970   0.15441583
## Volume.BTC     0.129918612  0.34327036    0.50178696 -0.100120933   0.23529218
## Volume.USD     0.301901750  0.48788228    0.69966714  0.200549244   0.16824766
## ETH.Price      0.817376505  0.51665623    0.57148617  0.095928802  -0.10137879
## ETH.Volume     0.574323310  0.69548915    0.65925783 -0.022683036   0.02769315
## LTC.Price      0.800819943  0.52908491    0.67084997  0.057194488  -0.16750611
## LTC.Volume     0.377375611  0.44231315    0.74015179 -0.090281630  -0.07400177
## XRP.Price      1.000000000  0.69792442    0.45554421  0.002754101  -0.14177873
## XRP.Volume     0.697924425  1.00000000    0.52513982  0.103723173   0.13827935
## Google.Search  0.455544206  0.52513982    1.00000000 -0.091750110  -0.02174655
## SP500.Price    0.002754101  0.10372317   -0.09175011  1.000000000   0.15438582
## SP500.Volume  -0.141778731  0.13827935   -0.02174655  0.154385820   1.00000000
## Gold.Price    -0.183135446  0.11793121   -0.08913261  0.764878470   0.45364167
## Gold.Volume    0.009514297 -0.01548651    0.02136058 -0.019322232  -0.01759803
##                 Gold.Price  Gold.Volume
## Open           0.573593605 -0.023230807
## High           0.560756003 -0.021539010
## Low            0.596833225 -0.021834979
## Close          0.576627525 -0.021679153
## Volume.BTC     0.025996013 -0.003082816
## Volume.USD     0.217613931 -0.020054724
## ETH.Price      0.013656885  0.011697303
## ETH.Volume    -0.004235498 -0.012074959
## LTC.Price     -0.128561607  0.004945084
## LTC.Volume    -0.132099033 -0.020733019
## XRP.Price     -0.183135446  0.009514297
## XRP.Volume     0.117931214 -0.015486512
## Google.Search -0.089132608  0.021360583
## SP500.Price    0.764878470 -0.019322232
## SP500.Volume   0.453641666 -0.017598033
## Gold.Price     1.000000000 -0.031549672
## Gold.Volume   -0.031549672  1.000000000
library("Hmisc")

library(RColorBrewer)

# Correlation plot with the coefficients printed, HL.Close excluded
corrplot(cor(master_cor2[, 1:17]), method = "number")

# Upper-triangle plot of master_cor2, with variables ordered by hierarchical
# clustering
M <- cor(master_cor2)
corrplot(
  M,
  type = "upper",
  order = "hclust",
  col = brewer.pal(n = 8, name = "RdYlBu")
)

## Plot Part 2 Y value = HL.Close

# Scatter plots of each predictor (taken from master_cor2) against the
# response HL.Close (taken from master_cor).
# Driving the plots from one vector avoids copy-paste slips (ETH.Price used
# to be plotted twice).
hl_predictors <- c(
  "Volume.USD", "Volume.BTC",
  "ETH.Price", "ETH.Volume",
  "LTC.Price", "LTC.Volume",
  "XRP.Price", "XRP.Volume",
  "SP500.Price", "SP500.Volume",
  "Gold.Price", "Gold.Volume"
)

for (predictor in hl_predictors) {
  plot(
    master_cor2[[predictor]], master_cor$HL.Close,
    pch = 20, col = "red",
    xlab = predictor, ylab = "HL.Close"
  )
}

### Remove highly correlated variables Typically you would want to remove variables that are highly correlated (0.4+) to avoid multicollinearity. However, as shown in the correlation charts, many variables are very highly correlated with each other. * To ensure that enough variables remain in our model, we will use 0.76 as the correlation cutoff point

library(caret)
## 
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
## 
##     cluster
# Correlation matrix over every column of master_cor.
# NOTE(review): this includes the response HL.Close, so findCorrelation()
# could in principle flag it for removal - confirm that is acceptable.
df2 <- cor(master_cor)

# Column positions flagged for removal at the chosen cutoff
# (any value can be used as the cutoff)
hc <- sort(findCorrelation(df2, cutoff = 0.76))

# Negative indexing with an empty vector would select zero columns, so only
# subset when at least one column was flagged.
if (length(hc) > 0) {
  master_reduced <- master_cor[, -hc, drop = FALSE]
} else {
  master_reduced <- master_cor
}
head(master_reduced)
##          Low Volume.BTC HL.Close ETH.Volume LTC.Volume XRP.Price XRP.Volume
## 1391  914.69    3678.36        1   437746.5   23569.63   0.00645    3249.53
## 1390  922.83    6624.94        1   413350.2   35332.93   0.00641   13926.48
## 1389  963.84    5983.96        1   630953.7   17621.75   0.00649   13118.79
## 1388  978.74    5623.69        1   513774.8   17753.63   0.00640   13887.87
## 1387  994.34    6731.61        1   531755.4   15202.71   0.00638   12139.60
## 1384 1014.64    4227.33        1   509447.5    6224.56   0.00638    2697.23
##      Google.Search SP500.Price SP500.Volume Gold.Volume
## 1391             4     2280.90   3591270000       50503
## 1390             4     2278.87   4087450000        3212
## 1389             4     2279.55   3916610000        1145
## 1388             4     2280.85   3807710000        1512
## 1387             4     2297.42   3597970000         865
## 1384             4     2292.56   3109050000         908

Regression

* Regression after eliminating the highly correlated variables

# Work on a copy of the correlation-filtered frame for the regressions
master_reg <- master_reduced
# Inspect the frame before building the training and testing sets
head(master_reg)
##          Low Volume.BTC HL.Close ETH.Volume LTC.Volume XRP.Price XRP.Volume
## 1391  914.69    3678.36        1   437746.5   23569.63   0.00645    3249.53
## 1390  922.83    6624.94        1   413350.2   35332.93   0.00641   13926.48
## 1389  963.84    5983.96        1   630953.7   17621.75   0.00649   13118.79
## 1388  978.74    5623.69        1   513774.8   17753.63   0.00640   13887.87
## 1387  994.34    6731.61        1   531755.4   15202.71   0.00638   12139.60
## 1384 1014.64    4227.33        1   509447.5    6224.56   0.00638    2697.23
##      Google.Search SP500.Price SP500.Volume Gold.Volume
## 1391             4     2280.90   3591270000       50503
## 1390             4     2278.87   4087450000        3212
## 1389             4     2279.55   3916610000        1145
## 1388             4     2280.85   3807710000        1512
## 1387             4     2297.42   3597970000         865
## 1384             4     2292.56   3109050000         908
# Random 80/20 train/test split.
# Fixed seed so the split, and every model fitted on it, is reproducible.
set.seed(2020)
num_samples <- nrow(master_reg)
sampling.rate <- 0.8
# floor() makes the training-set size an explicit whole number instead of
# relying on sample() to truncate the fractional size.
training <- sample(
  seq_len(num_samples),
  floor(sampling.rate * num_samples),
  replace = FALSE
)
# NOTE(review): rows are taken from `master`, not `master_reg`. Both have the
# same rows here, but `master` still carries the columns dropped by the
# correlation filter - confirm which frame is intended.
trainingSet <- master[training, ]
# Everything not drawn for training becomes the test set
testing <- setdiff(seq_len(num_samples), training)
testingSet <- master[testing, ]

Regression 1

Create a logistic regression model using the variables that remain after removing the highly correlated ones

# Model 1: HL.Close against all ten predictors kept by the correlation filter
LogisticReg <- glm(
  HL.Close ~ Low + Volume.BTC + ETH.Volume + LTC.Volume + XRP.Price +
    XRP.Volume + Google.Search + SP500.Price + SP500.Volume + Gold.Volume,
  family = binomial(logit),
  data = trainingSet
)
# Coefficient table, deviances and AIC
summary(LogisticReg)
## 
## Call:
## glm(formula = HL.Close ~ Low + Volume.BTC + ETH.Volume + LTC.Volume + 
##     XRP.Price + XRP.Volume + Google.Search + SP500.Price + SP500.Volume + 
##     Gold.Volume, family = binomial(logit), data = trainingSet)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0236  -1.2504   0.9491   1.0757   1.4711  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)  
## (Intercept)    5.164e-02  1.398e+00   0.037    0.971  
## Low           -8.809e-05  5.603e-05  -1.572    0.116  
## Volume.BTC    -1.502e-06  8.767e-06  -0.171    0.864  
## ETH.Volume    -2.818e-09  2.113e-09  -1.333    0.182  
## LTC.Volume    -1.020e-09  1.879e-09  -0.543    0.587  
## XRP.Price     -4.369e-01  4.502e-01  -0.970    0.332  
## XRP.Volume     1.235e-08  5.309e-09   2.327    0.020 *
## Google.Search  1.704e-02  1.172e-02   1.454    0.146  
## SP500.Price    2.920e-04  5.331e-04   0.548    0.584  
## SP500.Volume  -3.462e-11  8.527e-11  -0.406    0.685  
## Gold.Volume   -7.549e-09  2.320e-06  -0.003    0.997  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1045.9  on 760  degrees of freedom
## Residual deviance: 1032.6  on 750  degrees of freedom
## AIC: 1054.6
## 
## Number of Fisher Scoring iterations: 4

Regression 2

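Removal of SP500.Volume. We eliminate insignificant variables one by one, starting with the variable with the highest P value. (The output shown here comes from a random train/test split, so its p-values may differ from the run on which this elimination order was chosen.)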

# Model 2: same as model 1 without SP500.Volume
LogisticReg <- glm(
  HL.Close ~ Low + Volume.BTC + ETH.Volume + LTC.Volume + XRP.Price +
    XRP.Volume + Google.Search + SP500.Price + Gold.Volume,
  family = binomial(logit),
  data = trainingSet
)
# Coefficient table, deviances and AIC
summary(LogisticReg)
## 
## Call:
## glm(formula = HL.Close ~ Low + Volume.BTC + ETH.Volume + LTC.Volume + 
##     XRP.Price + XRP.Volume + Google.Search + SP500.Price + Gold.Volume, 
##     family = binomial(logit), data = trainingSet)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9968  -1.2469   0.9526   1.0795   1.4595  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)  
## (Intercept)   -1.604e-01  1.296e+00  -0.124   0.9015  
## Low           -9.385e-05  5.420e-05  -1.731   0.0834 .
## Volume.BTC    -2.423e-06  8.475e-06  -0.286   0.7750  
## ETH.Volume    -2.757e-09  2.102e-09  -1.312   0.1896  
## LTC.Volume    -9.179e-10  1.850e-09  -0.496   0.6197  
## XRP.Price     -3.772e-01  4.251e-01  -0.887   0.3749  
## XRP.Volume     1.166e-08  4.988e-09   2.339   0.0194 *
## Google.Search  1.766e-02  1.161e-02   1.521   0.1282  
## SP500.Price    3.300e-04  5.247e-04   0.629   0.5294  
## Gold.Volume    5.213e-09  2.318e-06   0.002   0.9982  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1045.9  on 760  degrees of freedom
## Residual deviance: 1032.8  on 751  degrees of freedom
## AIC: 1052.8
## 
## Number of Fisher Scoring iterations: 4

Regression 3

Removal of LTC.Volume

# Model 3: LTC.Volume dropped as well
LogisticReg <- glm(
  HL.Close ~ Low + Volume.BTC + ETH.Volume + XRP.Price + XRP.Volume +
    Google.Search + SP500.Price + Gold.Volume,
  family = binomial(logit),
  data = trainingSet
)
# Coefficient table, deviances and AIC
summary(LogisticReg)
## 
## Call:
## glm(formula = HL.Close ~ Low + Volume.BTC + ETH.Volume + XRP.Price + 
##     XRP.Volume + Google.Search + SP500.Price + Gold.Volume, family = binomial(logit), 
##     data = trainingSet)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9609  -1.2472   0.9498   1.0798   1.3882  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)  
## (Intercept)   -2.271e-01  1.290e+00  -0.176   0.8603  
## Low           -9.755e-05  5.376e-05  -1.814   0.0696 .
## Volume.BTC    -1.949e-06  8.405e-06  -0.232   0.8167  
## ETH.Volume    -3.151e-09  1.946e-09  -1.619   0.1054  
## XRP.Price     -3.532e-01  4.210e-01  -0.839   0.4016  
## XRP.Volume     1.190e-08  4.965e-09   2.397   0.0165 *
## Google.Search  1.512e-02  1.032e-02   1.465   0.1429  
## SP500.Price    3.668e-04  5.201e-04   0.705   0.4807  
## Gold.Volume    4.404e-08  2.317e-06   0.019   0.9848  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1045.9  on 760  degrees of freedom
## Residual deviance: 1033.0  on 752  degrees of freedom
## AIC: 1051
## 
## Number of Fisher Scoring iterations: 4

Regression 4

Removal of XRP.Price

# Model 4: XRP.Price dropped as well
LogisticReg <- glm(
  HL.Close ~ Low + Volume.BTC + ETH.Volume + XRP.Volume +
    Google.Search + SP500.Price + Gold.Volume,
  family = binomial(logit),
  data = trainingSet
)
# Coefficient table, deviances and AIC
summary(LogisticReg)
## 
## Call:
## glm(formula = HL.Close ~ Low + Volume.BTC + ETH.Volume + XRP.Volume + 
##     Google.Search + SP500.Price + Gold.Volume, family = binomial(logit), 
##     data = trainingSet)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9167  -1.2469   0.9708   1.0800   1.4799  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)  
## (Intercept)   -7.037e-01  1.158e+00  -0.608   0.5433  
## Low           -1.154e-04  4.940e-05  -2.337   0.0194 *
## Volume.BTC     3.866e-07  7.951e-06   0.049   0.9612  
## ETH.Volume    -3.562e-09  1.889e-09  -1.886   0.0593 .
## XRP.Volume     1.004e-08  4.457e-09   2.252   0.0243 *
## Google.Search  1.639e-02  1.019e-02   1.608   0.1078  
## SP500.Price    5.356e-04  4.796e-04   1.117   0.2641  
## Gold.Volume   -7.138e-08  2.313e-06  -0.031   0.9754  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1045.9  on 760  degrees of freedom
## Residual deviance: 1033.8  on 753  degrees of freedom
## AIC: 1049.8
## 
## Number of Fisher Scoring iterations: 4

Regression 5

Removal of Gold.Volume

# Model 5: Gold.Volume dropped as well
LogisticReg <- glm(
  HL.Close ~ Low + Volume.BTC + ETH.Volume + XRP.Volume +
    Google.Search + SP500.Price,
  family = binomial(logit),
  data = trainingSet
)
# Coefficient table, deviances and AIC
summary(LogisticReg)
## 
## Call:
## glm(formula = HL.Close ~ Low + Volume.BTC + ETH.Volume + XRP.Volume + 
##     Google.Search + SP500.Price, family = binomial(logit), data = trainingSet)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9165  -1.2467   0.9709   1.0778   1.4801  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)  
## (Intercept)   -7.048e-01  1.157e+00  -0.609   0.5425  
## Low           -1.154e-04  4.940e-05  -2.337   0.0195 *
## Volume.BTC     3.856e-07  7.951e-06   0.049   0.9613  
## ETH.Volume    -3.562e-09  1.889e-09  -1.886   0.0593 .
## XRP.Volume     1.004e-08  4.457e-09   2.252   0.0243 *
## Google.Search  1.638e-02  1.019e-02   1.608   0.1079  
## SP500.Price    5.358e-04  4.796e-04   1.117   0.2639  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1045.9  on 760  degrees of freedom
## Residual deviance: 1033.8  on 754  degrees of freedom
## AIC: 1047.8
## 
## Number of Fisher Scoring iterations: 4

Regression 6

Removal of Volume.BTC * This will be the last iteration of regression_v1, even though the P value of every variable except “Low” and “XRP.Volume” is still higher than the benchmark of 0.05. * This indicates that most of the variables we have used are statistically insignificant * If we were to extend this analysis, we would look into ANOVA and the lower and upper 95% confidence bounds to get a stronger understanding of the coefficients/variables

# Regression 6 (final V1 model): refit with Volume.BTC dropped
LogisticReg <- glm(
  HL.Close ~ Low + ETH.Volume + XRP.Volume + Google.Search + SP500.Price,
  family = binomial(logit),
  data = trainingSet
)
# Coefficient table, deviances and AIC of the refit
summary(LogisticReg)
## 
## Call:
## glm(formula = HL.Close ~ Low + ETH.Volume + XRP.Volume + Google.Search + 
##     SP500.Price, family = binomial(logit), data = trainingSet)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9186  -1.2474   0.9711   1.0781   1.4741  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)  
## (Intercept)   -7.013e-01  1.155e+00  -0.607   0.5437  
## Low           -1.156e-04  4.928e-05  -2.346   0.0190 *
## ETH.Volume    -3.528e-09  1.755e-09  -2.011   0.0443 *
## XRP.Volume     1.003e-08  4.453e-09   2.251   0.0244 *
## Google.Search  1.647e-02  1.004e-02   1.640   0.1010  
## SP500.Price    5.361e-04  4.795e-04   1.118   0.2635  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1045.9  on 760  degrees of freedom
## Residual deviance: 1033.8  on 755  degrees of freedom
## AIC: 1045.8
## 
## Number of Fisher Scoring iterations: 4

Version 2 of Regression

Only include statistically significant variables (Low and XRP.Volume)

# Version 2: logistic model restricted to Low and XRP.Volume
LogisticReg_v2 <- glm(
  HL.Close ~ Low + XRP.Volume,
  family = binomial(logit),
  data = trainingSet
)
# Coefficient table, deviances and AIC of the reduced model
summary(LogisticReg_v2)
## 
## Call:
## glm(formula = HL.Close ~ Low + XRP.Volume, family = binomial(logit), 
##     data = trainingSet)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6534  -1.2453   0.9809   1.0932   1.3124  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  5.489e-01  1.701e-01   3.226  0.00126 **
## Low         -5.929e-05  2.395e-05  -2.476  0.01329 * 
## XRP.Volume   5.195e-09  3.090e-09   1.681  0.09271 . 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1045.9  on 760  degrees of freedom
## Residual deviance: 1039.0  on 758  degrees of freedom
## AIC: 1045
## 
## Number of Fisher Scoring iterations: 4

Predictions of Regression V1

# Score the testing set with the final V1 model; type = "response" gives
# values between 0 and 1
predictions <- predict(LogisticReg, newdata = testingSet, type = "response")
# Round each value to the nearest class label (0 or 1)
predictedLabels <- round(predictions, digits = 0)

We compute the misclassification rate of Regression V1 (the rate of incorrect predictions).

# Number of rows in the testing set
sizeTestSet <- nrow(testingSet)
# Count the test rows whose predicted label differs from the true HL.Close
error <- sum(predictedLabels != testingSet$HL.Close)
# Fraction of test rows that were predicted incorrectly
misclassification_rate <- error / sizeTestSet
# Show the rate
print(misclassification_rate)
## [1] 0.4764398

Predictions of Regression V2 (only including Low and XRP.Volume)

# Score the testing set with the reduced V2 model (Low and XRP.Volume only)
predictions <- predict(LogisticReg_v2, newdata = testingSet, type = "response")

The predict function returns continuous values between 0 and 1. We need to convert these values to the discrete 0/1 classes

# Round each predicted value to the nearest class label (0 or 1)
predictedLabels <- round(predictions, digits = 0)

Plot the actual vs predicted values (for the testing set)

  • We compute the misclassification rate of Regression V2
# Number of rows in the testing set
sizeTestSet <- nrow(testingSet)
# Count the test rows whose predicted label differs from the true HL.Close
error <- sum(predictedLabels != testingSet$HL.Close)
# Fraction of test rows that were predicted incorrectly
misclassification_rate <- error / sizeTestSet
# Show the rate
print(misclassification_rate)
## [1] 0.4659686

Decision Tree

# Start the tree data from the master frame
master_tree <- master
# Recode the 0/1 target as a two-level factor: 0 -> "L", 1 -> "H"
master_tree$HL.Close <- factor(
  master_tree$HL.Close,
  levels = c(0, 1),
  labels = c("L", "H")
)
# Confirm the level order
levels(master_tree$HL.Close)
## [1] "L" "H"

Create Training and Testing sets (Note that this data set is small so let us keep 90% for training)

# Create Training and Testing Sets
# Fix the RNG seed so the random 90/10 split, and every result derived from
# it, is identical on each run (other sections of this file seed the RNG too).
set.seed(7)
num_samples <- nrow(master_tree)
sampling.rate <- 0.9
# seq_len() stays correct for an empty frame (1:0 would yield c(1, 0));
# floor() makes the whole-number training-set size explicit.
training <- sample(
  seq_len(num_samples),
  floor(sampling.rate * num_samples),
  replace = FALSE
)
trainingSet <- master_tree[training, ]
# Every row that was not drawn for training goes to the testing set
testing <- setdiff(seq_len(num_samples), training)
testingSet <- master_tree[testing, ]

Fit a decision tree to predict HL.Close using all the other variables.

library(rpart)
# Grow a classification tree for HL.Close from all other columns of the
# training data
decTreeModel <- rpart(
  formula = HL.Close ~ .,
  data = trainingSet,
  method = "class"
)

Display the tree

# Base-graphics view: draw the tree, then add the text labels
plot(decTreeModel, margin = 0.1)
text(decTreeModel)

# Same fitted tree drawn with the rpart.plot package
library(rpart.plot)
rpart.plot(decTreeModel)

Tune the size of the tree to avoid overfitting

plotcp(decTreeModel)

Prune the tree at cp = 0.018 * TODO: check that this value is right against the cp plot above

# Cut the tree back using a complexity parameter (cp) of 0.018
pruned_decTreeModel <- prune(decTreeModel, cp = 0.018)
# Draw the pruned tree and add its text labels
plot(pruned_decTreeModel, margin = 0.1)
text(pruned_decTreeModel)

rpart.plot(pruned_decTreeModel)

Evaluate the decision tree model using the testing set

# Classify the testing-set rows with the pruned tree; type = "class" returns
# the predicted L/H labels
predictedLabels <- predict(
  pruned_decTreeModel,
  newdata = testingSet,
  type = "class"
)
print(predictedLabels)
## 1373 1367 1366 1349 1347 1342 1321 1299 1248 1228 1222 1201 1199 1179 1171 1121 
##    H    H    H    H    H    H    H    H    H    H    H    H    H    H    H    H 
## 1117 1111 1054 1040 1031  985  983  954  922  921  914  906  899  883  873  866 
##    H    H    H    H    H    H    H    H    H    H    H    H    H    H    H    L 
##  855  838  837  821  817  813  801  800  794  789  778  764  753  752  726  705 
##    H    H    H    H    H    H    H    H    H    L    L    L    L    L    H    H 
##  703  701  689  676  675  666  619  613  579  561  558  515  512  508  498  451 
##    H    H    H    H    H    H    H    L    H    H    H    H    H    H    H    H 
##  442  428  414  401  396  381  347  344  330  288  270  256  232  207  198  187 
##    H    H    H    L    H    H    H    H    H    H    H    H    H    H    H    H 
##  183  165  162  157  150  145  116  100   99   96   94   22   15   11    3    2 
##    H    H    H    H    H    H    H    H    H    H    H    H    H    H    H    H 
## Levels: L H

Show the true labels

print(testingSet$HL.Close)
##  [1] H H L H L H H H H L L L H H L L H H H L L H H L L L L H H H L L H L L H H H
## [39] H L H L L H H H H H H H L H H L H L H H H H L H L L H L L L L L L L H H L H
## [77] H H L H H H H L L L H L H L H H L H H H
## Levels: L H

Decision Tree’s misclassification rate

# Number of rows in the testing set
sizeTestSet <- nrow(testingSet)
# Count the test rows whose predicted label differs from the true HL.Close
error <- sum(predictedLabels != testingSet$HL.Close)
# Fraction of test rows that were predicted incorrectly
misclassification_rate <- error / sizeTestSet
# Show the rate
print(misclassification_rate)
## [1] 0.40625

Random Forest

library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
# Reuse the tree data, whose Y variable (HL.Close) is already an L/H factor
master_forest <- master_tree
head(master_forest)
##         Open    High     Low   Close Volume.BTC Volume.USD HL.Close ETH.Price
## 1391  917.31  923.95  914.69  923.45    3678.36    3385239        H     10.59
## 1390  923.45  971.24  922.83  970.92    6624.94    6298154        H     10.74
## 1389  970.92  991.38  963.84  989.71    5983.96    5835317        H     10.73
## 1388  989.71 1010.00  978.74 1007.66    5623.69    5602317        H     10.82
## 1387 1007.66 1024.50  994.34 1016.77    6731.61    6815466        H     10.95
## 1384 1019.31 1027.70 1014.64 1024.39    4227.33    4321741        H     11.34
##      ETH.Volume LTC.Price LTC.Volume XRP.Price XRP.Volume Google.Search
## 1391   437746.5      4.03   23569.63   0.00645    3249.53             4
## 1390   413350.2      4.07   35332.93   0.00641   13926.48             4
## 1389   630953.7      4.08   17621.75   0.00649   13118.79             4
## 1388   513774.8      4.09   17753.63   0.00640   13887.87             4
## 1387   531755.4      4.06   15202.71   0.00638   12139.60             4
## 1384   509447.5      3.97    6224.56   0.00638    2697.23             4
##      SP500.Price SP500.Volume Gold.Price Gold.Volume
## 1391     2280.90   3591270000     1193.2       50503
## 1390     2278.87   4087450000     1208.6        3212
## 1389     2279.55   3916610000     1205.6        1145
## 1388     2280.85   3807710000     1216.7        1512
## 1387     2297.42   3597970000     1218.5         865
## 1384     2292.56   3109050000     1230.0         908

Create Training and Testing sets

# Create Training and Testing Sets
# Fix the RNG seed so the random 90/10 split, and the random forest fitted on
# it, is identical on each run (other sections of this file seed the RNG too).
set.seed(7)
num_samples <- nrow(master_forest)
sampling.rate <- 0.9
# seq_len() stays correct for an empty frame (1:0 would yield c(1, 0));
# floor() makes the whole-number training-set size explicit.
training <- sample(
  seq_len(num_samples),
  floor(sampling.rate * num_samples),
  replace = FALSE
)
trainingSet <- master_forest[training, ]
# Every row that was not drawn for training goes to the testing set
testing <- setdiff(seq_len(num_samples), training)
testingSet <- master_forest[testing, ]

Train a random forest using the training set data

# Fit a random forest for HL.Close on all other columns of the training data
RandForestModel <- randomForest(
  formula = HL.Close ~ .,
  data = trainingSet
)

Plot the error as a function of the number of trees

Interpret the graph

# Error curves of the forest as trees are added
plot(RandForestModel)
# Label the curves with the column names of the forest's err.rate matrix
legend(
  "top",
  legend = colnames(RandForestModel$err.rate),
  fill = 1:3
)

# Classify the testing-set rows with the fitted random forest
predictedLabels <- predict(RandForestModel, newdata = testingSet)

Random Forest Misclassification rate

# Number of rows in the testing set
sizeTestSet <- nrow(testingSet)
# Count the test rows whose predicted label differs from the true HL.Close
error <- sum(predictedLabels != testingSet$HL.Close)
# Fraction of test rows that were predicted incorrectly
misclassification_rate <- error / sizeTestSet
# Show the rate
print(misclassification_rate)
## [1] 0.4166667

Rank Features By Importance

# ensure results are repeatable
set.seed(7)
# load the libraries (caret supplies trainControl, train and varImp)
library(mlbench)
library(caret)
# master_tree already exists in the workspace. data() only loads datasets that
# ship with packages, so the former data(master_tree) call did nothing except
# raise a "data set not found" warning; it has been removed.
# prepare training scheme: 10-fold cross-validation, repeated 3 times
control <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
# train an LVQ model for HL.Close on all other columns, scaling the predictors
model <- train(
  HL.Close ~ .,
  data = master_tree,
  method = "lvq",
  preProcess = "scale",
  trControl = control
)
# estimate variable importance (left unscaled)
importance <- varImp(model, scale = FALSE)
# summarize importance
print(importance)
## ROC curve variable importance
## 
##               Importance
## LTC.Price         0.5359
## Close             0.5357
## Gold.Volume       0.5342
## Low               0.5339
## High              0.5314
## XRP.Price         0.5310
## Open              0.5309
## SP500.Volume      0.5308
## Volume.BTC        0.5273
## SP500.Price       0.5266
## ETH.Price         0.5232
## Google.Search     0.5201
## LTC.Volume        0.5095
## Gold.Price        0.5089
## Volume.USD        0.5023
## ETH.Volume        0.5022
## XRP.Volume        0.5006
# plot importance
plot(importance)

# KNN

# Start the KNN data from the master frame (HL.Close is still numeric 0/1)
master_knn <- master
head(master_knn)
##         Open    High     Low   Close Volume.BTC Volume.USD HL.Close ETH.Price
## 1391  917.31  923.95  914.69  923.45    3678.36    3385239        1     10.59
## 1390  923.45  971.24  922.83  970.92    6624.94    6298154        1     10.74
## 1389  970.92  991.38  963.84  989.71    5983.96    5835317        1     10.73
## 1388  989.71 1010.00  978.74 1007.66    5623.69    5602317        1     10.82
## 1387 1007.66 1024.50  994.34 1016.77    6731.61    6815466        1     10.95
## 1384 1019.31 1027.70 1014.64 1024.39    4227.33    4321741        1     11.34
##      ETH.Volume LTC.Price LTC.Volume XRP.Price XRP.Volume Google.Search
## 1391   437746.5      4.03   23569.63   0.00645    3249.53             4
## 1390   413350.2      4.07   35332.93   0.00641   13926.48             4
## 1389   630953.7      4.08   17621.75   0.00649   13118.79             4
## 1388   513774.8      4.09   17753.63   0.00640   13887.87             4
## 1387   531755.4      4.06   15202.71   0.00638   12139.60             4
## 1384   509447.5      3.97    6224.56   0.00638    2697.23             4
##      SP500.Price SP500.Volume Gold.Price Gold.Volume
## 1391     2280.90   3591270000     1193.2       50503
## 1390     2278.87   4087450000     1208.6        3212
## 1389     2279.55   3916610000     1205.6        1145
## 1388     2280.85   3807710000     1216.7        1512
## 1387     2297.42   3597970000     1218.5         865
## 1384     2292.56   3109050000     1230.0         908
# Move HL.Close (column 7) to the front; the other 17 columns keep their order
master_knn <- master_knn[c(7, 1:6, 8:18)]
head(master_knn)
##      HL.Close    Open    High     Low   Close Volume.BTC Volume.USD ETH.Price
## 1391        1  917.31  923.95  914.69  923.45    3678.36    3385239     10.59
## 1390        1  923.45  971.24  922.83  970.92    6624.94    6298154     10.74
## 1389        1  970.92  991.38  963.84  989.71    5983.96    5835317     10.73
## 1388        1  989.71 1010.00  978.74 1007.66    5623.69    5602317     10.82
## 1387        1 1007.66 1024.50  994.34 1016.77    6731.61    6815466     10.95
## 1384        1 1019.31 1027.70 1014.64 1024.39    4227.33    4321741     11.34
##      ETH.Volume LTC.Price LTC.Volume XRP.Price XRP.Volume Google.Search
## 1391   437746.5      4.03   23569.63   0.00645    3249.53             4
## 1390   413350.2      4.07   35332.93   0.00641   13926.48             4
## 1389   630953.7      4.08   17621.75   0.00649   13118.79             4
## 1388   513774.8      4.09   17753.63   0.00640   13887.87             4
## 1387   531755.4      4.06   15202.71   0.00638   12139.60             4
## 1384   509447.5      3.97    6224.56   0.00638    2697.23             4
##      SP500.Price SP500.Volume Gold.Price Gold.Volume
## 1391     2280.90   3591270000     1193.2       50503
## 1390     2278.87   4087450000     1208.6        3212
## 1389     2279.55   3916610000     1205.6        1145
## 1388     2280.85   3807710000     1216.7        1512
## 1387     2297.42   3597970000     1218.5         865
## 1384     2292.56   3109050000     1230.0         908
# Normalize All the Attributes ( NOT HL.CLOSE)
# Each listed column becomes a z-score: subtract the column mean, then divide
# by the column standard deviation.
knn_feature_cols <- c(
  "Open", "High", "Low", "Close", "Volume.BTC", "Volume.USD",
  "ETH.Price", "ETH.Volume", "LTC.Price", "LTC.Volume",
  "XRP.Price", "XRP.Volume", "Google.Search",
  "SP500.Price", "SP500.Volume", "Gold.Price", "Gold.Volume"
)
for (feature in knn_feature_cols) {
  master_knn[[feature]] <-
    (master_knn[[feature]] - mean(master_knn[[feature]])) / sd(master_knn[[feature]])
}

KNN V1: all variables (none removed)

# Create Training and Testing Sets
# Fix the RNG seed so the random 80/20 split and knn()'s random tie-breaking
# give the same result on each run (other sections of this file seed the RNG too).
set.seed(7)
num_samples <- nrow(master_knn)
sampling.rate <- 0.8
# seq_len() stays correct for an empty frame (1:0 would yield c(1, 0));
# floor() makes the whole-number training-set size explicit.
training <- sample(
  seq_len(num_samples),
  floor(sampling.rate * num_samples),
  replace = FALSE
)
trainingSet <- master_knn[training, ]
# Every row that was not drawn for training goes to the testing set
testing <- setdiff(seq_len(num_samples), training)
testingSet <- master_knn[testing, ]
# Get the features of the training set (every column except the label)
trainingfeatures <- subset(trainingSet, select = -HL.Close)
# Get the labels of the training set
traininglabels <- trainingSet$HL.Close
# Get the features of the testing set
testingfeatures <- subset(testingSet, select = -HL.Close)
# Load the classification library
library(class)
# call KNN with k=3
predictedLabels <- knn(trainingfeatures, testingfeatures, traininglabels, k = 3)

Display the predicted Labels

head(predictedLabels)
## [1] 1 0 1 1 1 1
## Levels: 0 1

Misclassification rate

# Number of rows in the testing set
sizeTestSet <- nrow(testingSet)
# Count the test rows whose predicted label differs from the true HL.Close
error <- sum(predictedLabels != testingSet$HL.Close)
# Fraction of test rows that were predicted incorrectly
misclassification_rate <- error / sizeTestSet
# Show the rate
print(misclassification_rate)
## [1] 0.460733

KNN V2: keep only the two top-ranked variables from varImp (Close and LTC.Price)

# Keep the label plus the two normalized columns used for KNN V2
master_knnv2 <- master_knn[, c("HL.Close", "Close", "LTC.Price")]
head(master_knnv2)
##      HL.Close     Close LTC.Price
## 1391        1 -1.789247 -1.278820
## 1390        1 -1.775800 -1.278050
## 1389        1 -1.770477 -1.277857
## 1388        1 -1.765393 -1.277665
## 1387        1 -1.762812 -1.278242
## 1384        1 -1.760654 -1.279976
# Create Training and Testing Sets
# Fix the RNG seed so the random 90/10 split and knn()'s random tie-breaking
# give the same result on each run (other sections of this file seed the RNG too).
set.seed(7)
num_samples <- nrow(master_knnv2)
sampling.rate <- 0.9
# seq_len() stays correct for an empty frame (1:0 would yield c(1, 0));
# floor() makes the whole-number training-set size explicit.
training <- sample(
  seq_len(num_samples),
  floor(sampling.rate * num_samples),
  replace = FALSE
)
trainingSet <- master_knnv2[training, ]
# Every row that was not drawn for training goes to the testing set
testing <- setdiff(seq_len(num_samples), training)
testingSet <- master_knnv2[testing, ]
# Get the features of the training set (every column except the label)
trainingfeatures <- subset(trainingSet, select = -HL.Close)
# Get the labels of the training set
traininglabels <- trainingSet$HL.Close
# Get the features of the testing set
testingfeatures <- subset(testingSet, select = -HL.Close)
# Load the classification library
library(class)
# call KNN with k=3
predictedLabels <- knn(trainingfeatures, testingfeatures, traininglabels, k = 3)

Display the predicted Labels

head(predictedLabels)
## [1] 1 1 1 0 1 1
## Levels: 0 1

Misclassification rate

# Number of rows in the testing set
sizeTestSet <- nrow(testingSet)
# Count the test rows whose predicted label differs from the true HL.Close
error <- sum(predictedLabels != testingSet$HL.Close)
# Fraction of test rows that were predicted incorrectly
misclassification_rate <- error / sizeTestSet
# Show the rate
print(misclassification_rate)
## [1] 0.4791667

KNN V3 Using Regression Variables (Low and XRP.Volume)

# Keep the label plus the two normalized columns kept by Regression V2
master_knnv3 <- master_knn[, c("HL.Close", "Low", "XRP.Volume")]
head(master_knnv3)
##      HL.Close       Low XRP.Volume
## 1391        1 -1.804629 -0.5685338
## 1390        1 -1.802205 -0.5681487
## 1389        1 -1.789993 -0.5681778
## 1388        1 -1.785556 -0.5681501
## 1387        1 -1.780911 -0.5682131
## 1384        1 -1.774866 -0.5685537
# Create Training and Testing Sets
# Fix the RNG seed so the random 90/10 split and knn()'s random tie-breaking
# give the same result on each run (other sections of this file seed the RNG too).
set.seed(7)
num_samples <- nrow(master_knnv3)
sampling.rate <- 0.9
# seq_len() stays correct for an empty frame (1:0 would yield c(1, 0));
# floor() makes the whole-number training-set size explicit.
training <- sample(
  seq_len(num_samples),
  floor(sampling.rate * num_samples),
  replace = FALSE
)
trainingSet <- master_knnv3[training, ]
# Every row that was not drawn for training goes to the testing set
testing <- setdiff(seq_len(num_samples), training)
testingSet <- master_knnv3[testing, ]
# Get the features of the training set (every column except the label)
trainingfeatures <- subset(trainingSet, select = -HL.Close)
# Get the labels of the training set
traininglabels <- trainingSet$HL.Close
# Get the features of the testing set
testingfeatures <- subset(testingSet, select = -HL.Close)
# Load the classification library
library(class)
# call KNN with k=3
predictedLabels <- knn(trainingfeatures, testingfeatures, traininglabels, k = 3)

Display the predicted Labels

head(predictedLabels)
## [1] 1 0 1 1 1 0
## Levels: 0 1

Misclassification rate

# Number of rows in the testing set
sizeTestSet <- nrow(testingSet)
# Count the test rows whose predicted label differs from the true HL.Close
error <- sum(predictedLabels != testingSet$HL.Close)
# Fraction of test rows that were predicted incorrectly
misclassification_rate <- error / sizeTestSet
# Show the rate
print(misclassification_rate)
## [1] 0.4479167

Feature Selection

set.seed(7)
# load the libraries (caret supplies rfe, rfeControl and rfFuncs)
library(mlbench)
library(caret)
# master_knn already exists in the workspace. data() only loads datasets that
# ship with packages, so the former data(master_knn) call did nothing except
# raise a "data set not found" warning; it has been removed.
# define the control using a random forest selection function
control <- rfeControl(functions = rfFuncs, method = "cv", number = 10)
# run the RFE algorithm
# HL.Close is the FIRST column of master_knn (it was moved to the front
# earlier), so the previous master_knn[, 1:17] / master_knn[, 18] call used
# HL.Close as a predictor and Gold.Volume as the outcome. Select the predictors
# by name and pass HL.Close as a factor so that rfe() ranks features for
# classifying HL.Close.
rfe_predictors <- master_knn[, setdiff(names(master_knn), "HL.Close")]
rfe_target <- factor(master_knn$HL.Close)
results <- rfe(rfe_predictors, rfe_target, sizes = 1:8, rfeControl = control)
# summarize the results
print(results)
## 
## Recursive feature selection
## 
## Outer resampling method: Cross-Validated (10 fold) 
## 
## Resampling performance over subset size:
## 
##  Variables   RMSE Rsquared    MAE RMSESD RsquaredSD   MAESD Selected
##          1 1.1377 0.003179 0.3261 0.3477    0.00658 0.10423         
##          2 1.0079 0.016361 0.3158 0.3771    0.02836 0.10000         
##          3 0.9833 0.019386 0.3241 0.3643    0.04547 0.09275         
##          4 0.9652 0.012435 0.3134 0.3720    0.02613 0.10128         
##          5 0.9603 0.008677 0.3128 0.3712    0.01597 0.10005        *
##          6 0.9676 0.007810 0.3172 0.3668    0.01201 0.10322         
##          7 0.9698 0.009991 0.3210 0.3675    0.01536 0.10259         
##          8 0.9637 0.015377 0.3194 0.3678    0.02498 0.10025         
##         17 0.9695 0.018932 0.3368 0.3712    0.03602 0.09967         
## 
## The top 5 variables (out of 5):
##    ETH.Price, SP500.Price, XRP.Volume, LTC.Price, ETH.Volume
# list the chosen features
predictors(results)
## [1] "ETH.Price"   "SP500.Price" "XRP.Volume"  "LTC.Price"   "ETH.Volume"
# plot the results
plot(results, type=c("g", "o"))

# Recursive Feature Selection

# Split the master frame into the target (y) and the predictor columns (x)
master_rfe <- master
y <- master_rfe$HL.Close
x <- master_rfe[, names(master_rfe) != "HL.Close"]
# Estimate caret's default pre-processing on the predictors, apply it, and keep
# the result as a data frame
normalization <- preProcess(x)
x <- as.data.frame(predict(normalization, x))
head(x)
##           Open      High       Low     Close Volume.BTC Volume.USD ETH.Price
## 1391 -1.792960 -1.787230 -1.804629 -1.789247 -0.9967384 -0.9056995 -1.325329
## 1390 -1.791214 -1.774265 -1.802205 -1.775800 -0.7381414 -0.8824137 -1.324625
## 1389 -1.777715 -1.768744 -1.789993 -1.770477 -0.7943950 -0.8861136 -1.324671
## 1388 -1.772372 -1.763639 -1.785556 -1.765393 -0.8260129 -0.8879762 -1.324249
## 1387 -1.767268 -1.759664 -1.780911 -1.762812 -0.7287799 -0.8782783 -1.323639
## 1384 -1.763956 -1.758787 -1.774866 -1.760654 -0.9485599 -0.8982131 -1.321809
##      ETH.Volume LTC.Price LTC.Volume XRP.Price XRP.Volume Google.Search
## 1391 -0.7388230 -1.278820 -0.4207717 -1.159030 -0.5685338    -0.8033421
## 1390 -0.7391833 -1.278050 -0.4205950 -1.159165 -0.5681487    -0.8033421
## 1389 -0.7359696 -1.277857 -0.4208610 -1.158896 -0.5681778    -0.8033421
## 1388 -0.7377002 -1.277665 -0.4208590 -1.159198 -0.5681501    -0.8033421
## 1387 -0.7374346 -1.278242 -0.4208973 -1.159265 -0.5682131    -0.8033421
## 1384 -0.7377641 -1.279976 -0.4210321 -1.159265 -0.5685537    -0.8033421
##      SP500.Price SP500.Volume Gold.Price Gold.Volume
## 1391   -1.765237  -0.27908820 -1.0146212  1.27925706
## 1390   -1.771899   0.20060822 -0.9445742 -0.08279469
## 1389   -1.769667   0.03544369 -0.9582197 -0.14232739
## 1388   -1.765401  -0.06983854 -0.9077314 -0.13175724
## 1387   -1.711019  -0.27261078 -0.8995439 -0.15039181
## 1384   -1.726969  -0.74528838 -0.8472362 -0.14915335
head(y)
## [1] 1 1 1 1 1 1
# Candidate subset sizes: every size from 1 up to the number of predictor
# columns in x. The former hard-coded 1..18 listed one more size than x has
# columns; deriving the sizes from x keeps them in step with the data.
subsets <- seq_len(ncol(x))
set.seed(10)

# Repeated cross-validation (3 repeats) with the linear-model helper functions
ctrl <- rfeControl(functions = lmFuncs, method = "repeatedcv", repeats = 3, verbose = FALSE)

# Run recursive feature elimination of x against y over the candidate sizes
lmProfile <- rfe(x, y, sizes = subsets, rfeControl = ctrl)
lmProfile
## 
## Recursive feature selection
## 
## Outer resampling method: Cross-Validated (10 fold, repeated 3 times) 
## 
## Resampling performance over subset size:
## 
##  Variables   RMSE Rsquared    MAE   RMSESD RsquaredSD    MAESD Selected
##          1 0.4984 0.017182 0.4954 0.006789   0.022279 0.006055        *
##          2 0.4986 0.015091 0.4954 0.006878   0.020125 0.006185         
##          3 0.4991 0.011846 0.4956 0.006826   0.013979 0.006199         
##          4 0.5000 0.008700 0.4951 0.007351   0.009308 0.006500         
##          5 0.4999 0.011001 0.4942 0.008156   0.010876 0.007341         
##          6 0.5001 0.011282 0.4936 0.008556   0.012683 0.007617         
##          7 0.5003 0.012203 0.4931 0.009126   0.014069 0.008019         
##          8 0.5007 0.009153 0.4927 0.008666   0.008585 0.007483         
##          9 0.5007 0.009738 0.4923 0.009013   0.010582 0.007648         
##         10 0.5010 0.009486 0.4924 0.009131   0.011287 0.007831         
##         11 0.5009 0.009580 0.4921 0.009023   0.012408 0.007816         
##         12 0.5017 0.008454 0.4927 0.009423   0.011090 0.008061         
##         13 0.5020 0.007977 0.4930 0.009407   0.010644 0.008014         
##         14 0.5024 0.007374 0.4933 0.009250   0.009620 0.007751         
##         15 0.5025 0.007248 0.4935 0.009367   0.009122 0.007891         
##         16 0.5026 0.007254 0.4936 0.009418   0.009115 0.007954         
##         17 0.5026 0.007285 0.4936 0.009430   0.009131 0.007965         
## 
## The top 1 variables (out of 1):
##    Low
predictors(lmProfile)
## [1] "Low"
lmProfile$fit
## 
## Call:
## lm(formula = y ~ ., data = tmp)
## 
## Coefficients:
## (Intercept)          Low  
##     0.54832     -0.02541
head(lmProfile$resample)
##    Variables      RMSE     Rsquared       MAE    Resample
## 1          1 0.4944017 5.723296e-05 0.4919751 Fold01.Rep1
## 18         1 0.4897414 6.199280e-03 0.4877692 Fold02.Rep1
## 35         1 0.5014909 6.588659e-03 0.4981807 Fold03.Rep1
## 52         1 0.5152455 3.860430e-02 0.5102679 Fold04.Rep1
## 69         1 0.4964578 1.985132e-02 0.4935238 Fold05.Rep1
## 86         1 0.4995176 7.608472e-03 0.4966717 Fold06.Rep1
trellis.par.set(caretTheme())
plot(lmProfile, type = c("g", "o"))

# SVM

# Start the SVM data from master_cor
master_svm <- master_cor
head(master_svm)
##         Open    High     Low   Close Volume.BTC Volume.USD HL.Close ETH.Price
## 1391  917.31  923.95  914.69  923.45    3678.36    3385239        1     10.59
## 1390  923.45  971.24  922.83  970.92    6624.94    6298154        1     10.74
## 1389  970.92  991.38  963.84  989.71    5983.96    5835317        1     10.73
## 1388  989.71 1010.00  978.74 1007.66    5623.69    5602317        1     10.82
## 1387 1007.66 1024.50  994.34 1016.77    6731.61    6815466        1     10.95
## 1384 1019.31 1027.70 1014.64 1024.39    4227.33    4321741        1     11.34
##      ETH.Volume LTC.Price LTC.Volume XRP.Price XRP.Volume Google.Search
## 1391   437746.5      4.03   23569.63   0.00645    3249.53             4
## 1390   413350.2      4.07   35332.93   0.00641   13926.48             4
## 1389   630953.7      4.08   17621.75   0.00649   13118.79             4
## 1388   513774.8      4.09   17753.63   0.00640   13887.87             4
## 1387   531755.4      4.06   15202.71   0.00638   12139.60             4
## 1384   509447.5      3.97    6224.56   0.00638    2697.23             4
##      SP500.Price SP500.Volume Gold.Price Gold.Volume
## 1391     2280.90   3591270000     1193.2       50503
## 1390     2278.87   4087450000     1208.6        3212
## 1389     2279.55   3916610000     1205.6        1145
## 1388     2280.85   3807710000     1216.7        1512
## 1387     2297.42   3597970000     1218.5         865
## 1384     2292.56   3109050000     1230.0         908
# Normalize All the Attributes ( NOT HL.CLOSE)
# Each listed column becomes a z-score: subtract the column mean, then divide
# by the column standard deviation.
svm_feature_cols <- c(
  "Open", "High", "Low", "Close", "Volume.BTC", "Volume.USD",
  "ETH.Price", "ETH.Volume", "LTC.Price", "LTC.Volume",
  "XRP.Price", "XRP.Volume", "Google.Search",
  "SP500.Price", "SP500.Volume", "Gold.Price", "Gold.Volume"
)
for (feature in svm_feature_cols) {
  master_svm[[feature]] <-
    (master_svm[[feature]] - mean(master_svm[[feature]])) / sd(master_svm[[feature]])
}

Make sure that the value that you are trying to predict is a factor

# Recode the 0/1 target as a two-level factor: 0 -> "L", 1 -> "H"
master_svm$HL.Close <- factor(
  master_svm$HL.Close,
  levels = c(0, 1),
  labels = c("L", "H")
)
# Confirm the level order
levels(master_svm$HL.Close)
## [1] "L" "H"

Create Training and testing sets

# Create Training and Testing Sets
# Fix the RNG seed so the random 80/20 split, and every SVM result derived
# from it, is identical on each run (other sections of this file seed the RNG too).
set.seed(7)
num_samples <- nrow(master_svm)
sampling.rate <- 0.8
# seq_len() stays correct for an empty frame (1:0 would yield c(1, 0));
# floor() makes the whole-number training-set size explicit.
training <- sample(
  seq_len(num_samples),
  floor(sampling.rate * num_samples),
  replace = FALSE
)
trainingSet <- master_svm[training, ]
# Every row that was not drawn for training goes to the testing set
testing <- setdiff(seq_len(num_samples), training)
testingSet <- master_svm[testing, ]

Linear SVM

# Load the SVM Library
library(e1071)
## 
## Attaching package: 'e1071'
## The following object is masked from 'package:Hmisc':
## 
##     impute
# Fit an SVM with a linear kernel and cost = 20 on the training data
svmModel <- svm(
  HL.Close ~ .,
  data = trainingSet,
  kernel = "linear",
  cost = 20
)
# Classify the testing-set rows and show the predicted labels
predictedLabels <- predict(svmModel, newdata = testingSet)
predictedLabels
## 1391 1389 1383 1382 1380 1376 1366 1355 1349 1338 1333 1324 1320 1319 1314 1313 
##    H    H    H    H    H    H    H    H    H    H    H    H    H    H    H    H 
## 1305 1291 1289 1286 1275 1270 1269 1255 1251 1247 1241 1226 1221 1213 1212 1207 
##    H    H    H    H    H    H    H    H    H    H    H    H    H    H    H    H 
## 1205 1186 1178 1177 1172 1165 1163 1158 1146 1132 1128 1124 1123 1108 1097 1076 
##    H    H    H    H    H    H    H    H    H    H    H    H    H    H    H    L 
## 1065 1059 1052 1051 1034 1023 1011 1009 1005  990  988  982  976  974  971  941 
##    L    L    H    L    H    L    H    H    H    H    H    H    H    L    H    L 
##  940  929  921  911  900  890  884  870  869  864  858  855  850  849  848  841 
##    L    L    H    H    L    H    H    H    H    H    L    H    L    L    L    L 
##  836  831  820  817  814  813  809  806  802  789  788  782  760  750  746  745 
##    H    L    L    L    L    L    L    L    L    L    L    H    L    L    L    L 
##  704  695  655  653  652  649  648  647  633  619  617  610  607  604  578  575 
##    H    H    H    H    H    H    H    H    H    H    H    H    H    H    H    H 
##  572  565  563  547  529  526  519  509  499  495  491  464  463  460  453  442 
##    H    H    H    L    L    L    L    H    L    L    L    L    L    L    L    L 
##  438  429  410  409  402  401  396  387  382  368  367  355  338  305  285  275 
##    L    L    H    L    H    H    H    L    L    H    H    H    H    H    H    H 
##  269  263  260  254  253  248  243  242  240  233  227  207  205  192  190  184 
##    H    H    H    H    H    H    H    H    H    H    H    H    H    H    H    H 
##  172  164  155  149  143  142  131  130  127  117  113  107  106  103  101  100 
##    H    H    H    H    H    H    H    H    H    H    H    H    H    H    H    H 
##   96   82   80   71   65   64   60   58   50   47   44   31   11    4    3 
##    H    H    H    H    H    H    H    H    H    H    H    H    L    L    H 
## Levels: L H
# Misclassification rate: the share of test rows whose predicted label
# differs from the observed HL.Close label.
sizeTestSet <- nrow(testingSet)
# Count of test rows where prediction and observation disagree
error <- sum(predictedLabels != testingSet$HL.Close)
misclassification_rate <- error / sizeTestSet
print(misclassification_rate)
## [1] 0.513089

Polynomial SVM

# Fit a polynomial-kernel SVM (error cost = 20) on the training set.
# HL.Close is the response; every other column is used as a predictor.
svmModel <- svm(
  HL.Close ~ .,
  data = trainingSet,
  kernel = "polynomial",
  cost = 20
)

Let us now make predictions on the test set.

# Predict the HL.Close label of every test row with the fitted SVM,
# then display the predictions.
predictedLabels <- predict(svmModel, newdata = testingSet)
predictedLabels
## 1391 1389 1383 1382 1380 1376 1366 1355 1349 1338 1333 1324 1320 1319 1314 1313 
##    H    H    H    H    H    H    H    H    H    H    H    H    H    H    H    H 
## 1305 1291 1289 1286 1275 1270 1269 1255 1251 1247 1241 1226 1221 1213 1212 1207 
##    H    H    H    H    H    H    H    L    L    H    H    H    L    H    H    H 
## 1205 1186 1178 1177 1172 1165 1163 1158 1146 1132 1128 1124 1123 1108 1097 1076 
##    H    H    H    L    H    H    H    H    H    H    H    H    H    H    H    L 
## 1065 1059 1052 1051 1034 1023 1011 1009 1005  990  988  982  976  974  971  941 
##    L    L    L    L    H    H    H    H    H    L    L    H    H    H    H    L 
##  940  929  921  911  900  890  884  870  869  864  858  855  850  849  848  841 
##    H    H    L    L    H    H    H    H    H    H    L    H    L    L    L    H 
##  836  831  820  817  814  813  809  806  802  789  788  782  760  750  746  745 
##    H    H    H    H    H    H    H    H    H    H    H    H    H    H    H    H 
##  704  695  655  653  652  649  648  647  633  619  617  610  607  604  578  575 
##    H    H    H    H    H    H    H    H    H    H    H    H    H    H    H    H 
##  572  565  563  547  529  526  519  509  499  495  491  464  463  460  453  442 
##    H    H    H    H    H    H    H    H    L    L    L    H    L    L    L    L 
##  438  429  410  409  402  401  396  387  382  368  367  355  338  305  285  275 
##    L    L    L    H    H    H    H    L    L    L    L    H    H    L    L    H 
##  269  263  260  254  253  248  243  242  240  233  227  207  205  192  190  184 
##    H    L    L    H    H    H    H    H    H    H    H    H    L    H    H    H 
##  172  164  155  149  143  142  131  130  127  117  113  107  106  103  101  100 
##    H    H    L    H    L    H    H    H    H    L    H    H    H    H    H    H 
##   96   82   80   71   65   64   60   58   50   47   44   31   11    4    3 
##    L    L    H    H    H    L    H    H    H    H    H    L    H    H    L 
## Levels: L H

We compute the misclassification rate (the rate of incorrect predictions).

# Size of the test set (number of rows)
sizeTestSet <- nrow(testingSet)
# Number of test rows whose predicted label differs from the observed one
error <- sum(predictedLabels != testingSet$HL.Close)
# Misclassification rate = misclassified rows / all test rows
misclassification_rate <- error / sizeTestSet
print(misclassification_rate)
## [1] 0.486911

Radial SVM

# Radial-kernel SVM with an error cost of 20: fit on the training set,
# predict the test set, then report the share of misclassified test rows.
svmModel <- svm(HL.Close ~ ., data = trainingSet, kernel = "radial", cost = 20)
predictedLabels <- predict(svmModel, testingSet)

# Misclassification rate = misclassified rows / all test rows
sizeTestSet <- nrow(testingSet)
error <- sum(predictedLabels != testingSet$HL.Close)
misclassification_rate <- error / sizeTestSet
print(misclassification_rate)
## [1] 0.4973822

Time Series

library("anytime")
library("bsts")
## Loading required package: BoomSpikeSlab
## Loading required package: Boom
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked _by_ '.GlobalEnv':
## 
##     SP500
## 
## Attaching package: 'Boom'
## The following object is masked from 'package:stats':
## 
##     rWishart
## 
## Attaching package: 'BoomSpikeSlab'
## The following object is masked from 'package:stats':
## 
##     knots
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: xts
## 
## Attaching package: 'bsts'
## The following object is masked from 'package:BoomSpikeSlab':
## 
##     SuggestBurn
library("car")
## Loading required package: carData
library("caret")
library("forecast")
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
## 
## Attaching package: 'forecast'
## The following object is masked _by_ '.GlobalEnv':
## 
##     gold
library("tseries")
library("TTR")
# Load the daily Coinbase BTC/USD data used for the time-series section.
master_reg <- read.csv("Coinbase_BTCUSD_d.csv")

# Randomly assign 99.6% of the rows to the training set and hold out the
# rest as the test set. The split is drawn exactly once.
# NOTE(review): a random hold-out is unusual for forecasting; a chronological
# split may be what is intended -- confirm. The rows drawn also depend on the
# RNG state, so results vary between runs unless a seed is set beforehand.
num_samples <- nrow(master_reg)
sampling.rate <- 0.996
training <- sample(
  seq_len(num_samples),
  floor(sampling.rate * num_samples),
  replace = FALSE
)
testing <- setdiff(seq_len(num_samples), training)

# Training set: drop Timestamp, Symbol and Volume.BTC, which leaves
# Date, Open, High, Low, Close and Volume.USD.
trainingSet <- master_reg[training, ]
trainingSet <- subset(trainingSet, select = -c(Timestamp, Symbol, Volume.BTC))

# Test set: keep only Date and Close.
testingSet <- master_reg[testing, ]
testingSet <- subset(
  testingSet,
  select = -c(Timestamp, Symbol, Open, High, Low, Volume.BTC, Volume.USD)
)
# Observed closing prices of the held-out rows (second column = Close)
testdata <- testingSet[, 2]

# Parse the date column into Date objects.
trainingSet$Date <- as.Date(anytime(trainingSet$Date))
testingSet$Date <- as.Date(anytime(testingSet$Date))

# Numeric copy of the USD volume. Any thousands separators are stripped
# first, and the stripped text (not the raw column) is what gets converted.
trainingSet$Volume <- as.numeric(gsub(",", "", trainingSet$Volume.USD))



# Re-express the training data as an xts object indexed by date; the Date
# column (column 1) becomes the index and is dropped from the data columns.
date_index <- as.POSIXct(trainingSet$Date)
trainingSet <- xts(trainingSet[, -1], order.by = date_index)

# Treat the fourth remaining column as a series with 365 observations per
# period starting at 2015, split it into its components, and plot them.
trainingSetResult <- ts(trainingSet[, 4], start = 2015, frequency = 365)
dects <- decompose(trainingSetResult)
plot(dects)

# Fit Holt's method (no damping) to rows 1000-2000 of the training closes.
holt_result <- holt(
  trainingSet[1000:2000, "Close"],
  type = "additive",
  damped = FALSE
)

# Forecast nine steps ahead.
holt_forecast <- forecast(holt_result, h = 9)

# Tabulate the point forecasts and prediction intervals, then display them.
holtdf <- as.data.frame(holt_forecast)
holtdf
##          Point Forecast    Lo 80     Hi 80    Lo 95    Hi 95
## 86486401       9453.302 8931.660  9974.945 8655.518 10251.09
## 86572801       9458.091 8720.446 10195.736 8329.960 10586.22
## 86659201       9462.880 8559.451 10366.309 8081.204 10844.56
## 86745601       9467.668 8424.451 10510.885 7872.205 11063.13
## 86832001       9472.457 8306.063 10638.850 7688.612 11256.30
## 86918401       9477.245 8199.473 10755.017 7523.062 11431.43
## 87004801       9482.034 8101.823 10862.245 7371.183 11592.88
## 87091201       9486.823 8011.249 10962.396 7230.127 11743.52
## 87177601       9491.611 7926.457 11056.765 7097.914 11885.31
# Plot the forecast table with the y-axis limited to 0-20000.
plot(holtdf, ylim = c(0,20000)) 

# Append the point forecasts (column 1 of holtdf) to the test rows so observed
# and forecast values sit side by side. The new column is named after the
# expression itself ("holtdf[, 1]"), so keep the expression as written if
# later code or output relies on that name.
# NOTE(review): this pairs forecasts with test rows purely by position and
# relies on both having the same number of rows (nine here) -- confirm.
holtfdf <- cbind(testingSet, holtdf[,1])
holtfdf
##            Date    Close holtdf[, 1]
## 267  2020-02-28  8708.89    9453.302
## 289  2020-02-06  9763.01    9458.091
## 329  2019-12-28  7302.67    9462.880
## 996  2018-03-01 10895.92    9467.668
## 1025 2018-01-31 10099.99    9472.457
## 1552 2016-08-22   585.12    9477.245
## 1567 2016-08-07   593.90    9482.034
## 1772 2016-01-15   373.43    9486.823
## 1948 2015-07-23   276.91    9491.611
# Error measures of the Holt point forecasts against the observed test closes
accuracy(holtdf[, 1], testdata)
##                 ME    RMSE      MAE       MPE     MAPE
## Test set -4072.475 6089.89 4597.075 -977.7898 982.7775
# Overlay observed closes (column 2, blue) and Holt forecasts (column 3,
# dark red) against the test dates.
ggplot() +
  geom_line(data = holtfdf, aes(Date, holtfdf[, 2]), color = "blue") +
  geom_line(data = holtfdf, aes(Date, holtfdf[, 3]), color = "Dark Red")

```